From 535d55c2dbb8cddec9a7fac6066cc8e1afa8ae5c Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 00:58:53 +0900 Subject: [PATCH 01/15] admin: implement P1 foundation (auth, router, cluster, listener) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lays down the read-only slice of the admin dashboard per docs/design/2026_04_24_proposed_admin_dashboard.md. No write endpoints yet — the 3.3.2 acceptance criteria still block those, and they ship together with AdminForward in a follow-up. internal/admin: - Config validation: hard startup failure on missing signing key, non-loopback without TLS, duplicate role assignments, wrong-length HS256 keys. - JWT signer/verifier: HS256, 1h TTL, accepts primary + previous key for rotation, rejects expired / future-dated / unknown-role tokens. - Router with strict prefix ordering so /admin/api/v1/* and /admin/healthz are never shadowed by the SPA fallback. - Middleware: 64 KiB BodyLimit + MaxBytesError translation, SessionAuth, RequireWriteRole, CSRF double-submit, slog-backed admin_audit logger for write paths. - Login / logout: Content-Type and per-IP 5/min rate limit, constant-time credential comparison, HttpOnly+Secure+SameSite=Strict session cookie + separate SPA-readable CSRF cookie on success, explicit 403 when a valid access key is not in the admin role table. - Cluster + healthz handlers behind a small ClusterInfoSource interface so the handler stays tested without main.go types. - Server ties everything together; NewServer returns a hard error on missing dependencies so misuse is caught at startup. main_admin.go + flags: - New admin-* CLI flags mirroring the Config surface. - startAdminServer validates config, builds the signer/verifier, loads the existing S3 credential map as the admin credential store, and attaches the server lifecycle to errgroup with a clean shutdown path. - ClusterInfoSource reads leader/state from raftGroupRuntime. Tests: happy paths + every failure branch across config, JWT, router, middleware, auth, cluster, and an in-process main_admin integration test that boots the real listener over plaintext and TLS. golangci-lint clean. go test ./... green. --- internal/admin/auth_handler.go | 268 ++++++++++++++++++++++++ internal/admin/auth_handler_test.go | 273 +++++++++++++++++++++++++ internal/admin/cluster_handler.go | 76 +++++++ internal/admin/cluster_handler_test.go | 77 +++++++ internal/admin/config.go | 226 ++++++++++++++++++++ internal/admin/config_test.go | 203 ++++++++++++++++++ internal/admin/jwt.go | 208 +++++++++++++++++++ internal/admin/jwt_test.go | 184 +++++++++++++++++ internal/admin/middleware.go | 240 ++++++++++++++++++++++ internal/admin/middleware_test.go | 238 +++++++++++++++++++++ internal/admin/principal.go | 28 +++ internal/admin/ratelimit.go | 89 ++++++++ internal/admin/router.go | 186 +++++++++++++++++ internal/admin/router_test.go | 142 +++++++++++++ internal/admin/server.go | 168 +++++++++++++++ internal/admin/server_test.go | 187 +++++++++++++++++ main.go | 15 ++ main_admin.go | 259 +++++++++++++++++++++++ main_admin_test.go | 259 +++++++++++++++++++++++ 19 files changed, 3326 insertions(+) create mode 100644 internal/admin/auth_handler.go create mode 100644 internal/admin/auth_handler_test.go create mode 100644 internal/admin/cluster_handler.go create mode 100644 internal/admin/cluster_handler_test.go create mode 100644 internal/admin/config.go create mode 100644 internal/admin/config_test.go create mode 100644 internal/admin/jwt.go create mode 100644 internal/admin/jwt_test.go create mode 100644 internal/admin/middleware.go create mode 100644 internal/admin/middleware_test.go create mode 100644 internal/admin/principal.go create mode 100644 internal/admin/ratelimit.go create mode 100644 internal/admin/router.go create mode 100644 internal/admin/router_test.go create mode 100644 internal/admin/server.go create mode 100644 internal/admin/server_test.go create mode 100644 main_admin.go create mode 100644 main_admin_test.go diff --git a/internal/admin/auth_handler.go b/internal/admin/auth_handler.go new file mode 100644 index 00000000..abfce86a --- /dev/null +++ b/internal/admin/auth_handler.go @@ -0,0 +1,268 @@ +package admin + +import ( + "crypto/rand" + "crypto/subtle" + "encoding/base64" + "io" + "net/http" + "strings" + "time" + + "github.com/cockroachdb/errors" + "github.com/goccy/go-json" +) + +// CredentialStore is the read-side view of the static SigV4 credential +// table the server was configured with. It returns the secret for a +// given access key, or ("", false) if the key is unknown. Supplying the +// same map the S3/DynamoDB adapters use keeps authentication consistent +// across the protocol surface. +type CredentialStore interface { + LookupSecret(accessKey string) (string, bool) +} + +// MapCredentialStore adapts a plain map into the CredentialStore +// interface. Callers typically load this from config at startup and +// hand the same map to the S3 adapter and the admin service. +type MapCredentialStore map[string]string + +// LookupSecret implements CredentialStore. +func (m MapCredentialStore) LookupSecret(accessKey string) (string, bool) { + secret, ok := m[strings.TrimSpace(accessKey)] + return secret, ok +} + +// AuthService wires the login/logout handlers, token minting, role +// lookup, and per-IP rate limiter together. Construct it once at +// startup and reuse across the admin listener's lifetime. +type AuthService struct { + signer *Signer + creds CredentialStore + roles map[string]Role + limiter *rateLimiter + sessionTTL time.Duration + secureCookie bool + cookieDomain string +} + +// AuthServiceOpts covers the knobs a caller may want to vary in tests. +// Zero values fall back to production defaults. +type AuthServiceOpts struct { + // InsecureCookie disables the Secure attribute on the issued + // cookies. It exists only for local plaintext-loopback development + // and is expected to stay false in any real deployment. + InsecureCookie bool + // CookieDomain is optional and rarely used. Empty means "host-only + // cookie", which is the default and the safest choice. + CookieDomain string + // LoginLimit is the per-IP rate limit (default 5). + LoginLimit int + // LoginWindow is the rate-limit window (default 1 minute). + LoginWindow time.Duration + // Clock drives rate-limiter aging. Defaults to SystemClock. + Clock Clock +} + +// NewAuthService constructs an AuthService. The signer must be primary +// (use NewSigner with the current key); token verification uses the +// Verifier passed separately to SessionAuth. +func NewAuthService(signer *Signer, creds CredentialStore, roles map[string]Role, opts AuthServiceOpts) *AuthService { + limit := opts.LoginLimit + if limit <= 0 { + limit = 5 + } + window := opts.LoginWindow + if window <= 0 { + window = time.Minute + } + if opts.Clock == nil { + opts.Clock = SystemClock + } + return &AuthService{ + signer: signer, + creds: creds, + roles: roles, + limiter: newRateLimiter(limit, window, opts.Clock), + sessionTTL: sessionTTL, + secureCookie: !opts.InsecureCookie, + cookieDomain: opts.CookieDomain, + } +} + +// loginRequest is the JSON body the login endpoint accepts. +type loginRequest struct { + AccessKey string `json:"access_key"` + SecretKey string `json:"secret_key"` +} + +// loginResponse is the JSON body the login endpoint returns on success. +// The CSRF token is also readable from the admin_csrf cookie; we include +// it here as a convenience for clients that want to avoid parsing the +// Set-Cookie header themselves. +type loginResponse struct { + Role Role `json:"role"` + ExpiresAt time.Time `json:"expires_at"` +} + +// HandleLogin validates credentials and issues the session + CSRF cookies. +// It is safe to expose without the SessionAuth middleware because this is +// where a session first comes from; rate limiting, Content-Type validation, +// and constant-time credential comparison guard it. +func (s *AuthService) HandleLogin(w http.ResponseWriter, r *http.Request) { + if !s.preflightLogin(w, r) { + return + } + req, ok := readLoginRequest(w, r) + if !ok { + return + } + principal, ok := s.authenticate(w, req) + if !ok { + return + } + s.issueSession(w, principal) +} + +func (s *AuthService) preflightLogin(w http.ResponseWriter, r *http.Request) bool { + if r.Method != http.MethodPost { + writeJSONError(w, http.StatusMethodNotAllowed, "method_not_allowed", "login requires POST") + return false + } + if !s.limiter.allow(clientIP(r)) { + w.Header().Set("Retry-After", "60") + writeJSONError(w, http.StatusTooManyRequests, "rate_limited", + "too many login attempts from this source; try again later") + return false + } + ct := strings.ToLower(strings.TrimSpace(r.Header.Get("Content-Type"))) + if !strings.HasPrefix(ct, "application/json") { + writeJSONError(w, http.StatusUnsupportedMediaType, "unsupported_media_type", + "login requires Content-Type: application/json") + return false + } + return true +} + +func readLoginRequest(w http.ResponseWriter, r *http.Request) (loginRequest, bool) { + raw, err := io.ReadAll(r.Body) + if err != nil { + if IsMaxBytesError(err) { + WriteMaxBytesError(w) + return loginRequest{}, false + } + writeJSONError(w, http.StatusBadRequest, "invalid_body", "failed to read body") + return loginRequest{}, false + } + var req loginRequest + if err := json.Unmarshal(raw, &req); err != nil { + writeJSONError(w, http.StatusBadRequest, "invalid_body", "body is not valid JSON") + return loginRequest{}, false + } + req.AccessKey = strings.TrimSpace(req.AccessKey) + req.SecretKey = strings.TrimSpace(req.SecretKey) + if req.AccessKey == "" || req.SecretKey == "" { + writeJSONError(w, http.StatusBadRequest, "missing_fields", + "access_key and secret_key are required") + return loginRequest{}, false + } + return req, true +} + +// dummySecretLen is the length we pad the timing-safe comparison with +// when the access key is unknown. It roughly matches the length of a +// typical AWS secret-access-key. +const dummySecretLen = 40 + +func (s *AuthService) authenticate(w http.ResponseWriter, req loginRequest) (AuthPrincipal, bool) { + expected, known := s.creds.LookupSecret(req.AccessKey) + if !known { + // Still compare against a dummy secret to keep timing + // roughly equivalent between known and unknown keys. + dummy := strings.Repeat("x", dummySecretLen) + _ = subtle.ConstantTimeCompare([]byte(req.SecretKey), []byte(dummy)) + writeJSONError(w, http.StatusUnauthorized, "invalid_credentials", + "access_key or secret_key is invalid") + return AuthPrincipal{}, false + } + if subtle.ConstantTimeCompare([]byte(req.SecretKey), []byte(expected)) != 1 { + writeJSONError(w, http.StatusUnauthorized, "invalid_credentials", + "access_key or secret_key is invalid") + return AuthPrincipal{}, false + } + role, ok := s.roles[req.AccessKey] + if !ok { + writeJSONError(w, http.StatusForbidden, "forbidden", + "access_key is not authorised for admin access") + return AuthPrincipal{}, false + } + return AuthPrincipal{AccessKey: req.AccessKey, Role: role}, true +} + +func (s *AuthService) issueSession(w http.ResponseWriter, principal AuthPrincipal) { + token, err := s.signer.Sign(principal) + if err != nil { + writeJSONError(w, http.StatusInternalServerError, "internal", "failed to mint session token") + return + } + csrf, err := newCSRFToken() + if err != nil { + writeJSONError(w, http.StatusInternalServerError, "internal", "failed to mint csrf token") + return + } + expires := time.Now().UTC().Add(s.sessionTTL) + http.SetCookie(w, s.buildCookie(sessionCookieName, token, true)) + http.SetCookie(w, s.buildCookie(csrfCookieName, csrf, false)) + w.Header().Set("Cache-Control", "no-store") + w.Header().Set("Content-Type", "application/json; charset=utf-8") + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(loginResponse{Role: principal.Role, ExpiresAt: expires}) +} + +// HandleLogout clears both cookies. It does not require authentication — +// clearing stale cookies after a session has expired is always safe. +func (s *AuthService) HandleLogout(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + writeJSONError(w, http.StatusMethodNotAllowed, "method_not_allowed", "logout requires POST") + return + } + http.SetCookie(w, s.buildExpiredCookie(sessionCookieName, true)) + http.SetCookie(w, s.buildExpiredCookie(csrfCookieName, false)) + w.Header().Set("Cache-Control", "no-store") + w.WriteHeader(http.StatusNoContent) +} + +func (s *AuthService) buildCookie(name, value string, httpOnly bool) *http.Cookie { + return &http.Cookie{ + Name: name, + Value: value, + Path: pathPrefixAdmin, + Domain: s.cookieDomain, + MaxAge: int(s.sessionTTL.Seconds()), + Secure: s.secureCookie, + HttpOnly: httpOnly, + SameSite: http.SameSiteStrictMode, + } +} + +func (s *AuthService) buildExpiredCookie(name string, httpOnly bool) *http.Cookie { + return &http.Cookie{ + Name: name, + Value: "", + Path: pathPrefixAdmin, + Domain: s.cookieDomain, + MaxAge: -1, + Expires: time.Unix(0, 0), + Secure: s.secureCookie, + HttpOnly: httpOnly, + SameSite: http.SameSiteStrictMode, + } +} + +func newCSRFToken() (string, error) { + var raw [32]byte + if _, err := rand.Read(raw[:]); err != nil { + return "", errors.Wrap(err, "read random bytes for csrf token") + } + return base64.RawURLEncoding.EncodeToString(raw[:]), nil +} diff --git a/internal/admin/auth_handler_test.go b/internal/admin/auth_handler_test.go new file mode 100644 index 00000000..95696c52 --- /dev/null +++ b/internal/admin/auth_handler_test.go @@ -0,0 +1,273 @@ +package admin + +import ( + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "github.com/goccy/go-json" + "github.com/stretchr/testify/require" +) + +func newAuthServiceForTest(t *testing.T) (*AuthService, *Verifier) { + t.Helper() + clk := fixedClock(time.Unix(1_700_000_000, 0).UTC()) + signer := newSignerForTest(t, 1, clk) + verifier := newVerifierForTest(t, []byte{1}, clk) + + creds := MapCredentialStore{ + "AKIA_ADMIN": "ADMIN_SECRET", + "AKIA_RO": "RO_SECRET", + "AKIA_OTHER": "OTHER_SECRET", // present in creds but not in roles + } + roles := map[string]Role{ + "AKIA_ADMIN": RoleFull, + "AKIA_RO": RoleReadOnly, + } + svc := NewAuthService(signer, creds, roles, AuthServiceOpts{ + Clock: clk, + }) + return svc, verifier +} + +func postJSON(t *testing.T, body any) *http.Request { + t.Helper() + buf, err := json.Marshal(body) + require.NoError(t, err) + req := httptest.NewRequest(http.MethodPost, "/admin/api/v1/auth/login", strings.NewReader(string(buf))) + req.Header.Set("Content-Type", "application/json") + req.RemoteAddr = "127.0.0.1:50001" + return req +} + +func TestLogin_HappyPathFull(t *testing.T) { + svc, verifier := newAuthServiceForTest(t) + req := postJSON(t, loginRequest{AccessKey: "AKIA_ADMIN", SecretKey: "ADMIN_SECRET"}) + rec := httptest.NewRecorder() + svc.HandleLogin(rec, req) + + require.Equal(t, http.StatusOK, rec.Code) + + var resp loginResponse + require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &resp)) + require.Equal(t, RoleFull, resp.Role) + + // Find both cookies and assert attributes. + var session, csrf *http.Cookie + for _, c := range rec.Result().Cookies() { + switch c.Name { + case sessionCookieName: + session = c + case csrfCookieName: + csrf = c + } + } + require.NotNil(t, session, "expected admin_session cookie") + require.NotNil(t, csrf, "expected admin_csrf cookie") + + // Cookie hardening. + require.True(t, session.HttpOnly) + require.True(t, session.Secure) + require.Equal(t, http.SameSiteStrictMode, session.SameSite) + require.Equal(t, "/admin", session.Path) + require.Equal(t, int(sessionTTL.Seconds()), session.MaxAge) + + require.False(t, csrf.HttpOnly, "CSRF cookie must be readable by SPA") + require.True(t, csrf.Secure) + require.Equal(t, http.SameSiteStrictMode, csrf.SameSite) + require.Equal(t, "/admin", csrf.Path) + + // Token must verify. + principal, err := verifier.Verify(session.Value) + require.NoError(t, err) + require.Equal(t, "AKIA_ADMIN", principal.AccessKey) + require.Equal(t, RoleFull, principal.Role) +} + +func TestLogin_ReadOnlyMappedToRoleReadOnly(t *testing.T) { + svc, verifier := newAuthServiceForTest(t) + req := postJSON(t, loginRequest{AccessKey: "AKIA_RO", SecretKey: "RO_SECRET"}) + rec := httptest.NewRecorder() + svc.HandleLogin(rec, req) + require.Equal(t, http.StatusOK, rec.Code) + + var session *http.Cookie + for _, c := range rec.Result().Cookies() { + if c.Name == sessionCookieName { + session = c + } + } + require.NotNil(t, session) + principal, err := verifier.Verify(session.Value) + require.NoError(t, err) + require.Equal(t, RoleReadOnly, principal.Role) +} + +func TestLogin_WrongSecretRejected(t *testing.T) { + svc, _ := newAuthServiceForTest(t) + req := postJSON(t, loginRequest{AccessKey: "AKIA_ADMIN", SecretKey: "WRONG"}) + rec := httptest.NewRecorder() + svc.HandleLogin(rec, req) + require.Equal(t, http.StatusUnauthorized, rec.Code) + require.Contains(t, rec.Body.String(), "invalid_credentials") + // No cookies on failure. + require.Empty(t, rec.Result().Cookies()) +} + +func TestLogin_UnknownAccessKeyRejected(t *testing.T) { + svc, _ := newAuthServiceForTest(t) + req := postJSON(t, loginRequest{AccessKey: "AKIA_NOBODY", SecretKey: "anything"}) + rec := httptest.NewRecorder() + svc.HandleLogin(rec, req) + require.Equal(t, http.StatusUnauthorized, rec.Code) + require.Contains(t, rec.Body.String(), "invalid_credentials") +} + +func TestLogin_CredentialValidButNotAdminRejected(t *testing.T) { + svc, _ := newAuthServiceForTest(t) + // AKIA_OTHER is in creds but not in the role index. + req := postJSON(t, loginRequest{AccessKey: "AKIA_OTHER", SecretKey: "OTHER_SECRET"}) + rec := httptest.NewRecorder() + svc.HandleLogin(rec, req) + require.Equal(t, http.StatusForbidden, rec.Code) + require.Contains(t, rec.Body.String(), "forbidden") +} + +func TestLogin_MissingFields(t *testing.T) { + svc, _ := newAuthServiceForTest(t) + for _, body := range []loginRequest{ + {AccessKey: "", SecretKey: "x"}, + {AccessKey: "x", SecretKey: ""}, + {}, + } { + req := postJSON(t, body) + rec := httptest.NewRecorder() + svc.HandleLogin(rec, req) + require.Equal(t, http.StatusBadRequest, rec.Code) + } +} + +func TestLogin_RequiresJSON(t *testing.T) { + svc, _ := newAuthServiceForTest(t) + req := httptest.NewRequest(http.MethodPost, "/admin/api/v1/auth/login", + strings.NewReader("access_key=x&secret_key=y")) + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") + req.RemoteAddr = "127.0.0.1:1" + rec := httptest.NewRecorder() + svc.HandleLogin(rec, req) + require.Equal(t, http.StatusUnsupportedMediaType, rec.Code) +} + +func TestLogin_OnlyPOST(t *testing.T) { + svc, _ := newAuthServiceForTest(t) + req := httptest.NewRequest(http.MethodGet, "/admin/api/v1/auth/login", nil) + req.RemoteAddr = "127.0.0.1:1" + rec := httptest.NewRecorder() + svc.HandleLogin(rec, req) + require.Equal(t, http.StatusMethodNotAllowed, rec.Code) +} + +func TestLogin_RateLimitPerIP(t *testing.T) { + svc, _ := newAuthServiceForTest(t) + // Configure is already default 5/min. + for i := 0; i < 5; i++ { + req := postJSON(t, loginRequest{AccessKey: "AKIA_ADMIN", SecretKey: "WRONG"}) + rec := httptest.NewRecorder() + svc.HandleLogin(rec, req) + require.Equalf(t, http.StatusUnauthorized, rec.Code, "attempt %d", i+1) + } + // 6th attempt from the same IP must be rate limited. + req := postJSON(t, loginRequest{AccessKey: "AKIA_ADMIN", SecretKey: "WRONG"}) + rec := httptest.NewRecorder() + svc.HandleLogin(rec, req) + require.Equal(t, http.StatusTooManyRequests, rec.Code) + require.Equal(t, "60", rec.Header().Get("Retry-After")) +} + +func TestLogin_RateLimitIsPerIPNotGlobal(t *testing.T) { + svc, _ := newAuthServiceForTest(t) + for i := 0; i < 5; i++ { + req := postJSON(t, loginRequest{AccessKey: "AKIA_ADMIN", SecretKey: "WRONG"}) + rec := httptest.NewRecorder() + svc.HandleLogin(rec, req) + require.Equal(t, http.StatusUnauthorized, rec.Code) + } + // Different IP — should not be throttled. + req := postJSON(t, loginRequest{AccessKey: "AKIA_ADMIN", SecretKey: "ADMIN_SECRET"}) + req.RemoteAddr = "10.0.0.1:12345" + rec := httptest.NewRecorder() + svc.HandleLogin(rec, req) + require.Equal(t, http.StatusOK, rec.Code) +} + +func TestLogin_InsecureCookieOptIn(t *testing.T) { + clk := fixedClock(time.Unix(1_700_000_000, 0).UTC()) + signer := newSignerForTest(t, 1, clk) + creds := MapCredentialStore{"AKIA_ADMIN": "ADMIN_SECRET"} + roles := map[string]Role{"AKIA_ADMIN": RoleFull} + svc := NewAuthService(signer, creds, roles, AuthServiceOpts{Clock: clk, InsecureCookie: true}) + + req := postJSON(t, loginRequest{AccessKey: "AKIA_ADMIN", SecretKey: "ADMIN_SECRET"}) + rec := httptest.NewRecorder() + svc.HandleLogin(rec, req) + require.Equal(t, http.StatusOK, rec.Code) + for _, c := range rec.Result().Cookies() { + require.Falsef(t, c.Secure, "cookie %s must not be Secure in dev mode", c.Name) + } +} + +func TestLogout_ExpiresBothCookies(t *testing.T) { + svc, _ := newAuthServiceForTest(t) + req := httptest.NewRequest(http.MethodPost, "/admin/api/v1/auth/logout", nil) + req.RemoteAddr = "127.0.0.1:1" + rec := httptest.NewRecorder() + svc.HandleLogout(rec, req) + + require.Equal(t, http.StatusNoContent, rec.Code) + var names []string + for _, c := range rec.Result().Cookies() { + require.Equal(t, -1, c.MaxAge) + require.Equal(t, "", c.Value) + names = append(names, c.Name) + } + require.ElementsMatch(t, []string{sessionCookieName, csrfCookieName}, names) +} + +func TestLogout_OnlyPOST(t *testing.T) { + svc, _ := newAuthServiceForTest(t) + req := httptest.NewRequest(http.MethodGet, "/admin/api/v1/auth/logout", nil) + req.RemoteAddr = "127.0.0.1:1" + rec := httptest.NewRecorder() + svc.HandleLogout(rec, req) + require.Equal(t, http.StatusMethodNotAllowed, rec.Code) +} + +func TestRateLimiter_ResetsAfterWindow(t *testing.T) { + now := time.Unix(1_700_000_000, 0).UTC() + clk := func() time.Time { return now } + rl := newRateLimiter(2, time.Minute, clk) + + require.True(t, rl.allow("1.1.1.1")) + require.True(t, rl.allow("1.1.1.1")) + require.False(t, rl.allow("1.1.1.1")) + + now = now.Add(61 * time.Second) + require.True(t, rl.allow("1.1.1.1")) +} + +func TestClientIP(t *testing.T) { + for _, tc := range []struct { + remote string + want string + }{ + {"127.0.0.1:12345", "127.0.0.1"}, + {"[::1]:12345", "::1"}, + {"no-port", "no-port"}, + } { + req := httptest.NewRequest(http.MethodGet, "/", nil) + req.RemoteAddr = tc.remote + require.Equal(t, tc.want, clientIP(req)) + } +} diff --git a/internal/admin/cluster_handler.go b/internal/admin/cluster_handler.go new file mode 100644 index 00000000..64e8357f --- /dev/null +++ b/internal/admin/cluster_handler.go @@ -0,0 +1,76 @@ +package admin + +import ( + "context" + "net/http" + "time" + + "github.com/goccy/go-json" +) + +// ClusterInfo is the lightweight snapshot the admin dashboard displays on +// its landing page. Everything here is cheap to assemble; we deliberately +// do not include per-shard key counts or byte statistics to keep the +// endpoint safe to poll. +type ClusterInfo struct { + NodeID string `json:"node_id"` + Version string `json:"version"` + Timestamp time.Time `json:"timestamp"` + Groups []GroupInfo `json:"groups"` +} + +// GroupInfo describes a single Raft group from the local node's point of +// view. LeaderID is the empty string during an election or when the node +// has not yet discovered the leader. +type GroupInfo struct { + GroupID uint64 `json:"group_id"` + LeaderID string `json:"leader_id"` + Members []string `json:"members"` + IsLeader bool `json:"is_leader"` +} + +// ClusterInfoSource is the small contract the cluster handler calls out +// to. Production wires this to a real Raft/engine view; tests use a stub. +type ClusterInfoSource interface { + Describe(ctx context.Context) (ClusterInfo, error) +} + +// ClusterInfoFunc is a convenience adapter for wiring a plain function +// without defining an interface implementation. +type ClusterInfoFunc func(ctx context.Context) (ClusterInfo, error) + +// Describe implements ClusterInfoSource. +func (f ClusterInfoFunc) Describe(ctx context.Context) (ClusterInfo, error) { + return f(ctx) +} + +// ClusterHandler serves GET /admin/api/v1/cluster. +type ClusterHandler struct { + source ClusterInfoSource +} + +// NewClusterHandler wires a source into the HTTP handler. +func NewClusterHandler(source ClusterInfoSource) *ClusterHandler { + return &ClusterHandler{source: source} +} + +// ServeHTTP renders the cluster snapshot as JSON. Errors from the source +// become 500s; the body never exposes internal state beyond an error code. +func (h *ClusterHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + writeJSONError(w, http.StatusMethodNotAllowed, "method_not_allowed", "only GET") + return + } + info, err := h.source.Describe(r.Context()) + if err != nil { + writeJSONError(w, http.StatusInternalServerError, "cluster_describe_failed", err.Error()) + return + } + if info.Timestamp.IsZero() { + info.Timestamp = time.Now().UTC() + } + w.Header().Set("Content-Type", "application/json; charset=utf-8") + w.Header().Set("Cache-Control", "no-store") + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(info) +} diff --git a/internal/admin/cluster_handler_test.go b/internal/admin/cluster_handler_test.go new file mode 100644 index 00000000..289fbdad --- /dev/null +++ b/internal/admin/cluster_handler_test.go @@ -0,0 +1,77 @@ +package admin + +import ( + "context" + "errors" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/goccy/go-json" + "github.com/stretchr/testify/require" +) + +func TestClusterHandler_HappyPath(t *testing.T) { + source := ClusterInfoFunc(func(_ context.Context) (ClusterInfo, error) { + return ClusterInfo{ + NodeID: "node-1", + Version: "0.42.0", + Groups: []GroupInfo{ + {GroupID: 1, LeaderID: "node-1", IsLeader: true, Members: []string{"node-1", "node-2", "node-3"}}, + {GroupID: 2, LeaderID: "node-2", Members: []string{"node-1", "node-2", "node-3"}}, + }, + }, nil + }) + h := NewClusterHandler(source) + req := httptest.NewRequest(http.MethodGet, "/admin/api/v1/cluster", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + + require.Equal(t, http.StatusOK, rec.Code) + var got ClusterInfo + require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &got)) + require.Equal(t, "node-1", got.NodeID) + require.Equal(t, "0.42.0", got.Version) + require.Len(t, got.Groups, 2) + require.True(t, got.Groups[0].IsLeader) + require.False(t, got.Timestamp.IsZero()) +} + +func TestClusterHandler_PreservesExplicitTimestamp(t *testing.T) { + ts := time.Date(2026, 4, 24, 10, 0, 0, 0, time.UTC) + source := ClusterInfoFunc(func(_ context.Context) (ClusterInfo, error) { + return ClusterInfo{NodeID: "n", Version: "v", Timestamp: ts}, nil + }) + h := NewClusterHandler(source) + req := httptest.NewRequest(http.MethodGet, "/admin/api/v1/cluster", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + + var got ClusterInfo + require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &got)) + require.Equal(t, ts, got.Timestamp.UTC()) +} + +func TestClusterHandler_SourceErrorReturns500(t *testing.T) { + source := ClusterInfoFunc(func(_ context.Context) (ClusterInfo, error) { + return ClusterInfo{}, errors.New("describe failed") + }) + h := NewClusterHandler(source) + req := httptest.NewRequest(http.MethodGet, "/admin/api/v1/cluster", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + + require.Equal(t, http.StatusInternalServerError, rec.Code) + require.Contains(t, rec.Body.String(), "cluster_describe_failed") +} + +func TestClusterHandler_OnlyGET(t *testing.T) { + h := NewClusterHandler(ClusterInfoFunc(func(_ context.Context) (ClusterInfo, error) { + return ClusterInfo{}, nil + })) + req := httptest.NewRequest(http.MethodPost, "/admin/api/v1/cluster", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + require.Equal(t, http.StatusMethodNotAllowed, rec.Code) +} diff --git a/internal/admin/config.go b/internal/admin/config.go new file mode 100644 index 00000000..6ec23f08 --- /dev/null +++ b/internal/admin/config.go @@ -0,0 +1,226 @@ +package admin + +import ( + "encoding/base64" + "net" + "strings" + + "github.com/cockroachdb/errors" +) + +const ( + // sessionSigningKeyLen is the required raw byte length for the admin + // JWT HS256 signing key. + sessionSigningKeyLen = 64 +) + +// Config captures everything the admin listener needs at startup. It mirrors +// the Section 7.1 table in docs/design/2026_04_24_proposed_admin_dashboard.md +// and intentionally uses plain Go fields rather than a config library so the +// existing flag-based wiring in main.go can hand values over without a new +// dependency. +type Config struct { + // Enabled toggles the admin listener. Default false. + Enabled bool + + // Listen is the host:port for the admin HTTP server. Default + // 127.0.0.1:8080 (loopback only). + Listen string + + // TLSCertFile / TLSKeyFile enable TLS when both are set. + TLSCertFile string + TLSKeyFile string + + // AllowPlaintextNonLoopback opts out of the TLS-on-non-loopback + // requirement. Refusing to honour it is the default. + AllowPlaintextNonLoopback bool + + // SessionSigningKey is the base64-encoded cluster-wide HS256 key. It + // must decode to exactly 64 bytes. + SessionSigningKey string + + // SessionSigningKeyPrevious is an optional base64-encoded previous + // signing key, used to verify tokens issued before a key rotation. + SessionSigningKeyPrevious string + + // ReadOnlyAccessKeys grants the GET subset of admin endpoints. + ReadOnlyAccessKeys []string + + // FullAccessKeys grants the full CRUD surface of admin endpoints. + FullAccessKeys []string + + // AllowInsecureDevCookie turns off the always-on Secure cookie + // attribute. Intended only for local plaintext development; it is off + // by default and the startup banner calls it out loudly. + AllowInsecureDevCookie bool +} + +// Validate returns the first configuration error found, if any. It does not +// try to collect every error because any of these conditions is a hard +// startup failure. +func (c *Config) Validate() error { + if c == nil { + return errors.New("admin config is nil") + } + if !c.Enabled { + return nil + } + if err := c.validateListen(); err != nil { + return err + } + if err := c.validateTLS(); err != nil { + return err + } + if err := c.validateSigningKeys(); err != nil { + return err + } + return validateAccessKeyRoles(c.ReadOnlyAccessKeys, c.FullAccessKeys) +} + +func (c *Config) validateListen() error { + listen := strings.TrimSpace(c.Listen) + if listen == "" { + return errors.New("admin.listen must not be empty when admin.enabled=true") + } + if _, _, err := net.SplitHostPort(listen); err != nil { + return errors.Wrapf(err, "admin.listen %q is not host:port", c.Listen) + } + return nil +} + +func (c *Config) validateTLS() error { + tlsConfigured := strings.TrimSpace(c.TLSCertFile) != "" && strings.TrimSpace(c.TLSKeyFile) != "" + if tlsConfigured || !addressRequiresTLS(strings.TrimSpace(c.Listen)) || c.AllowPlaintextNonLoopback { + return nil + } + return errors.WithStack(errors.Newf( + "admin.listen %q is not loopback but TLS is not configured;"+ + " set admin.tls.cert_file + admin.tls.key_file, or explicitly pass"+ + " --admin-allow-plaintext-non-loopback (strongly discouraged)", + c.Listen, + )) +} + +func (c *Config) validateSigningKeys() error { + primary, err := decodeSigningKey("admin.session_signing_key", c.SessionSigningKey) + if err != nil { + return err + } + if len(primary) == 0 { + return errors.New("admin.session_signing_key is required when admin.enabled=true") + } + if strings.TrimSpace(c.SessionSigningKeyPrevious) == "" { + return nil + } + if _, err := decodeSigningKey("admin.session_signing_key_previous", c.SessionSigningKeyPrevious); err != nil { + return err + } + return nil +} + +// DecodedSigningKeys returns the raw HS256 keys in verification order: the +// primary signing key first, followed by an optional previous key. Validate +// must be called first. +func (c *Config) DecodedSigningKeys() ([][]byte, error) { + primary, err := decodeSigningKey("admin.session_signing_key", c.SessionSigningKey) + if err != nil { + return nil, err + } + keys := [][]byte{primary} + if strings.TrimSpace(c.SessionSigningKeyPrevious) != "" { + prev, err := decodeSigningKey("admin.session_signing_key_previous", c.SessionSigningKeyPrevious) + if err != nil { + return nil, err + } + keys = append(keys, prev) + } + return keys, nil +} + +// RoleIndex returns a map from access key to Role after Validate has +// succeeded. The caller must not mutate the returned map. +func (c *Config) RoleIndex() map[string]Role { + index := make(map[string]Role, len(c.FullAccessKeys)+len(c.ReadOnlyAccessKeys)) + for _, k := range c.FullAccessKeys { + trim := strings.TrimSpace(k) + if trim == "" { + continue + } + index[trim] = RoleFull + } + for _, k := range c.ReadOnlyAccessKeys { + trim := strings.TrimSpace(k) + if trim == "" { + continue + } + // Overlap is rejected by Validate; this branch only runs for + // keys exclusive to read_only. + index[trim] = RoleReadOnly + } + return index +} + +func decodeSigningKey(field, encoded string) ([]byte, error) { + trim := strings.TrimSpace(encoded) + if trim == "" { + return nil, nil + } + raw, err := base64.StdEncoding.DecodeString(trim) + if err != nil { + raw, err = base64.RawStdEncoding.DecodeString(trim) + if err != nil { + return nil, errors.Wrapf(err, "%s is not valid base64", field) + } + } + if len(raw) != sessionSigningKeyLen { + return nil, errors.WithStack(errors.Newf( + "%s must decode to %d bytes but got %d bytes", + field, sessionSigningKeyLen, len(raw), + )) + } + return raw, nil +} + +func validateAccessKeyRoles(readOnly, full []string) error { + fullSet := make(map[string]struct{}, len(full)) + for _, k := range full { + trim := strings.TrimSpace(k) + if trim == "" { + continue + } + fullSet[trim] = struct{}{} + } + for _, k := range readOnly { + trim := strings.TrimSpace(k) + if trim == "" { + continue + } + if _, dup := fullSet[trim]; dup { + return errors.WithStack(errors.Newf( + "access key %q is listed in both admin.read_only_access_keys and admin.full_access_keys;"+ + " this would silently grant write access depending on lookup order, so it is rejected at startup", + trim, + )) + } + } + return nil +} + +// addressRequiresTLS reports whether a listen address is exposed beyond +// loopback and therefore must use TLS. Mirrors monitoring.AddressRequiresToken +// so the admin package does not import monitoring. +func addressRequiresTLS(addr string) bool { + host, _, err := net.SplitHostPort(strings.TrimSpace(addr)) + if err != nil { + return true + } + host = strings.TrimSpace(host) + if host == "" || host == "0.0.0.0" || host == "::" { + return true + } + if strings.EqualFold(host, "localhost") { + return false + } + ip := net.ParseIP(host) + return ip == nil || !ip.IsLoopback() +} diff --git a/internal/admin/config_test.go b/internal/admin/config_test.go new file mode 100644 index 00000000..82a64c61 --- /dev/null +++ b/internal/admin/config_test.go @@ -0,0 +1,203 @@ +package admin + +import ( + "encoding/base64" + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +func makeKey(seed byte) string { + raw := make([]byte, sessionSigningKeyLen) + for i := range raw { + raw[i] = seed + } + return base64.StdEncoding.EncodeToString(raw) +} + +func TestConfigValidate_DisabledNoOp(t *testing.T) { + c := &Config{} + require.NoError(t, c.Validate()) +} + +func TestConfigValidate_RequiresListen(t *testing.T) { + c := &Config{ + Enabled: true, + SessionSigningKey: makeKey(1), + } + err := c.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), "admin.listen must not be empty") +} + +func TestConfigValidate_RequiresSigningKey(t *testing.T) { + c := &Config{ + Enabled: true, + Listen: "127.0.0.1:8080", + } + err := c.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), "session_signing_key is required") +} + +func TestConfigValidate_SigningKeyWrongLength(t *testing.T) { + short := base64.StdEncoding.EncodeToString([]byte("too short")) + c := &Config{ + Enabled: true, + Listen: "127.0.0.1:8080", + SessionSigningKey: short, + } + err := c.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), "must decode to 64 bytes") +} + +func TestConfigValidate_SigningKeyNotBase64(t *testing.T) { + c := &Config{ + Enabled: true, + Listen: "127.0.0.1:8080", + SessionSigningKey: "!!!not base64!!!", + } + err := c.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), "not valid base64") +} + +func TestConfigValidate_LoopbackNoTLSOK(t *testing.T) { + for _, addr := range []string{"127.0.0.1:8080", "[::1]:8080", "localhost:8080"} { + c := &Config{ + Enabled: true, + Listen: addr, + SessionSigningKey: makeKey(7), + } + require.NoErrorf(t, c.Validate(), "loopback %s", addr) + } +} + +func TestConfigValidate_NonLoopbackRequiresTLS(t *testing.T) { + c := &Config{ + Enabled: true, + Listen: "0.0.0.0:8080", + SessionSigningKey: makeKey(2), + } + err := c.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), "TLS is not configured") + require.Contains(t, err.Error(), "allow-plaintext-non-loopback") +} + +func TestConfigValidate_NonLoopbackWithTLSOK(t *testing.T) { + c := &Config{ + Enabled: true, + Listen: "10.0.0.1:8443", + TLSCertFile: "cert.pem", + TLSKeyFile: "key.pem", + SessionSigningKey: makeKey(3), + } + require.NoError(t, c.Validate()) +} + +func TestConfigValidate_NonLoopbackPlaintextOptInOK(t *testing.T) { + c := &Config{ + Enabled: true, + Listen: "0.0.0.0:8080", + AllowPlaintextNonLoopback: true, + SessionSigningKey: makeKey(4), + } + require.NoError(t, c.Validate()) +} + +func TestConfigValidate_OverlappingRolesRejected(t *testing.T) { + c := &Config{ + Enabled: true, + Listen: "127.0.0.1:8080", + SessionSigningKey: makeKey(5), + ReadOnlyAccessKeys: []string{"AKIA1", "AKIA2"}, + FullAccessKeys: []string{"AKIA2", "AKIA3"}, + } + err := c.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), "AKIA2") + require.Contains(t, err.Error(), "both") +} + +func TestConfigValidate_RoleIndexExclusive(t *testing.T) { + c := &Config{ + Enabled: true, + Listen: "127.0.0.1:8080", + SessionSigningKey: makeKey(6), + ReadOnlyAccessKeys: []string{"AKIA_RO"}, + FullAccessKeys: []string{"AKIA_ADMIN"}, + } + require.NoError(t, c.Validate()) + idx := c.RoleIndex() + require.Equal(t, RoleReadOnly, idx["AKIA_RO"]) + require.Equal(t, RoleFull, idx["AKIA_ADMIN"]) + _, unknown := idx["AKIA_NOBODY"] + require.False(t, unknown) +} + +func TestConfigValidate_PreviousSigningKeyValidated(t *testing.T) { + c := &Config{ + Enabled: true, + Listen: "127.0.0.1:8080", + SessionSigningKey: makeKey(1), + SessionSigningKeyPrevious: base64.StdEncoding.EncodeToString([]byte("short")), + } + err := c.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), "session_signing_key_previous") +} + +func TestConfigDecodedSigningKeys_Order(t *testing.T) { + c := &Config{ + Enabled: true, + Listen: "127.0.0.1:8080", + SessionSigningKey: makeKey(1), + SessionSigningKeyPrevious: makeKey(2), + } + require.NoError(t, c.Validate()) + keys, err := c.DecodedSigningKeys() + require.NoError(t, err) + require.Len(t, keys, 2) + require.Equal(t, byte(1), keys[0][0]) + require.Equal(t, byte(2), keys[1][0]) +} + +func TestRole_AllowsWrite(t *testing.T) { + require.True(t, RoleFull.AllowsWrite()) + require.False(t, RoleReadOnly.AllowsWrite()) + require.False(t, Role("").AllowsWrite()) +} + +func TestAddressRequiresTLS(t *testing.T) { + require.False(t, addressRequiresTLS("127.0.0.1:8080")) + require.False(t, addressRequiresTLS("[::1]:8080")) + require.False(t, addressRequiresTLS("localhost:8080")) + require.True(t, addressRequiresTLS(":8080")) + require.True(t, addressRequiresTLS("0.0.0.0:8080")) + require.True(t, addressRequiresTLS("10.0.0.1:8080")) + require.True(t, addressRequiresTLS("garbage")) +} + +func TestConfigValidate_DisabledIgnoresBadFields(t *testing.T) { + c := &Config{ + Enabled: false, + Listen: "not a host port", + } + require.NoError(t, c.Validate()) +} + +// TestConfigValidate_PreservesContext ensures the validation message +// includes the offending field so operators can resolve errors quickly. +func TestConfigValidate_PreservesContext(t *testing.T) { + c := &Config{ + Enabled: true, + Listen: "::::::::", + SessionSigningKey: makeKey(8), + } + err := c.Validate() + require.Error(t, err) + require.True(t, strings.Contains(err.Error(), "admin.listen")) +} diff --git a/internal/admin/jwt.go b/internal/admin/jwt.go new file mode 100644 index 00000000..c27a7ce5 --- /dev/null +++ b/internal/admin/jwt.go @@ -0,0 +1,208 @@ +package admin + +import ( + "crypto/hmac" + "crypto/rand" + "crypto/sha256" + "crypto/subtle" + "encoding/base64" + "strings" + "time" + + "github.com/cockroachdb/errors" + "github.com/goccy/go-json" +) + +// Session TTL for admin JWTs. Aligns with the 1h Max-Age specified for the +// session cookie in the design doc (Section 6.1). +const sessionTTL = 1 * time.Hour + +// jwtSegments is the fixed number of dot-separated segments in a valid +// HS256 JWT (header.payload.signature). +const jwtSegments = 3 + +// jwtHeader is the fixed HS256 JWT header. Admin never issues anything else. +var jwtHeaderEncoded = base64.RawURLEncoding.EncodeToString([]byte(`{"alg":"HS256","typ":"JWT"}`)) + +type jwtClaims struct { + Sub string `json:"sub"` + Role string `json:"role"` + IAT int64 `json:"iat"` + EXP int64 `json:"exp"` + JTI string `json:"jti"` +} + +// Clock is the small time abstraction used by the signer/verifier so tests +// can control token freshness without sleeping. +type Clock func() time.Time + +// SystemClock returns wall-clock time and is the default for production. +func SystemClock() time.Time { return time.Now().UTC() } + +// Signer issues HS256-signed JWTs using the primary admin signing key. Only +// the primary key can sign new tokens; the previous key is verify-only and +// lives on Verifier. +type Signer struct { + key []byte + clock Clock +} + +// NewSigner constructs a Signer; key must be exactly sessionSigningKeyLen +// bytes (validated up-front so we do not catch this inside the hot path). +func NewSigner(key []byte, clock Clock) (*Signer, error) { + if len(key) != sessionSigningKeyLen { + return nil, errors.WithStack(errors.Newf("signer key must be %d bytes, got %d", sessionSigningKeyLen, len(key))) + } + if clock == nil { + clock = SystemClock + } + copied := append([]byte{}, key...) + return &Signer{key: copied, clock: clock}, nil +} + +// Sign mints a fresh JWT for principal with the admin session TTL. +func (s *Signer) Sign(principal AuthPrincipal) (string, error) { + jti, err := randomJTI() + if err != nil { + return "", err + } + now := s.clock().UTC() + claims := jwtClaims{ + Sub: principal.AccessKey, + Role: string(principal.Role), + IAT: now.Unix(), + EXP: now.Add(sessionTTL).Unix(), + JTI: jti, + } + payload, err := json.Marshal(claims) + if err != nil { + return "", errors.Wrap(err, "marshal jwt claims") + } + encodedPayload := base64.RawURLEncoding.EncodeToString(payload) + signingInput := jwtHeaderEncoded + "." + encodedPayload + mac := hmac.New(sha256.New, s.key) + mac.Write([]byte(signingInput)) + sig := base64.RawURLEncoding.EncodeToString(mac.Sum(nil)) + return signingInput + "." + sig, nil +} + +// Verifier validates HS256 admin tokens. It tries the primary key first and +// falls back to the optional previous key so operators can rotate keys +// without logging everybody out at once. +type Verifier struct { + keys [][]byte + clock Clock +} + +// NewVerifier builds a verifier from keys in priority order (primary first, +// optional previous second). Zero-length keys are rejected. +func NewVerifier(keys [][]byte, clock Clock) (*Verifier, error) { + if len(keys) == 0 { + return nil, errors.New("verifier requires at least one key") + } + for i, k := range keys { + if len(k) != sessionSigningKeyLen { + return nil, errors.WithStack(errors.Newf("verifier key[%d] must be %d bytes, got %d", i, sessionSigningKeyLen, len(k))) + } + } + copied := make([][]byte, len(keys)) + for i, k := range keys { + copied[i] = append([]byte{}, k...) + } + if clock == nil { + clock = SystemClock + } + return &Verifier{keys: copied, clock: clock}, nil +} + +// ErrInvalidToken is returned for any verification failure without leaking +// which specific check failed. Callers should log the wrapped error but +// return a single 401 to clients regardless of the cause. +var ErrInvalidToken = errors.New("invalid admin session token") + +// Verify parses token, checks the signature against each configured key, +// and confirms it is within its validity window. On success it returns the +// embedded AuthPrincipal. +func (v *Verifier) Verify(token string) (AuthPrincipal, error) { + signingInput, payloadSeg, sig, err := splitSignedToken(token) + if err != nil { + return AuthPrincipal{}, err + } + if err := v.checkSignature(signingInput, sig); err != nil { + return AuthPrincipal{}, err + } + claims, err := decodeClaims(payloadSeg) + if err != nil { + return AuthPrincipal{}, err + } + return v.validateClaims(claims) +} + +// clockSkewToleranceSeconds is the slack we allow on the "issued in the +// future" check so that minor NTP drift between admin nodes does not +// produce spurious 401s. +const clockSkewToleranceSeconds = 30 + +func splitSignedToken(token string) (signingInput, payloadSeg string, sig []byte, err error) { + parts := strings.Split(token, ".") + if len(parts) != jwtSegments { + return "", "", nil, errors.Wrap(ErrInvalidToken, "token does not have three segments") + } + if parts[0] != jwtHeaderEncoded { + return "", "", nil, errors.Wrap(ErrInvalidToken, "unsupported jwt header") + } + sig, err = base64.RawURLEncoding.DecodeString(parts[2]) + if err != nil { + return "", "", nil, errors.Wrap(ErrInvalidToken, "malformed signature") + } + return parts[0] + "." + parts[1], parts[1], sig, nil +} + +func (v *Verifier) checkSignature(signingInput string, providedSig []byte) error { + for _, k := range v.keys { + mac := hmac.New(sha256.New, k) + mac.Write([]byte(signingInput)) + if subtle.ConstantTimeCompare(mac.Sum(nil), providedSig) == 1 { + return nil + } + } + return errors.Wrap(ErrInvalidToken, "signature mismatch") +} + +func decodeClaims(payloadSeg string) (jwtClaims, error) { + payload, err := base64.RawURLEncoding.DecodeString(payloadSeg) + if err != nil { + return jwtClaims{}, errors.Wrap(ErrInvalidToken, "malformed payload") + } + var claims jwtClaims + if err := json.Unmarshal(payload, &claims); err != nil { + return jwtClaims{}, errors.Wrap(ErrInvalidToken, "payload is not json") + } + return claims, nil +} + +func (v *Verifier) validateClaims(claims jwtClaims) (AuthPrincipal, error) { + now := v.clock().UTC().Unix() + if claims.EXP == 0 || now >= claims.EXP { + return AuthPrincipal{}, errors.Wrap(ErrInvalidToken, "token expired") + } + if claims.IAT != 0 && now+clockSkewToleranceSeconds < claims.IAT { + return AuthPrincipal{}, errors.Wrap(ErrInvalidToken, "token issued in the future") + } + if claims.Sub == "" { + return AuthPrincipal{}, errors.Wrap(ErrInvalidToken, "missing sub") + } + role := Role(claims.Role) + if role != RoleReadOnly && role != RoleFull { + return AuthPrincipal{}, errors.Wrap(ErrInvalidToken, "unknown role") + } + return AuthPrincipal{AccessKey: claims.Sub, Role: role}, nil +} + +func randomJTI() (string, error) { + var raw [16]byte + if _, err := rand.Read(raw[:]); err != nil { + return "", errors.Wrap(err, "read random bytes for jti") + } + return base64.RawURLEncoding.EncodeToString(raw[:]), nil +} diff --git a/internal/admin/jwt_test.go b/internal/admin/jwt_test.go new file mode 100644 index 00000000..b7301f4e --- /dev/null +++ b/internal/admin/jwt_test.go @@ -0,0 +1,184 @@ +package admin + +import ( + "bytes" + "crypto/hmac" + "crypto/sha256" + "encoding/base64" + "strings" + "testing" + "time" + + "github.com/cockroachdb/errors" + "github.com/stretchr/testify/require" +) + +func fixedClock(t time.Time) Clock { return func() time.Time { return t } } + +func newSignerForTest(t *testing.T, seed byte, clk Clock) *Signer { + t.Helper() + key := bytes.Repeat([]byte{seed}, sessionSigningKeyLen) + s, err := NewSigner(key, clk) + require.NoError(t, err) + return s +} + +func newVerifierForTest(t *testing.T, seeds []byte, clk Clock) *Verifier { + t.Helper() + keys := make([][]byte, 0, len(seeds)) + for _, seed := range seeds { + keys = append(keys, bytes.Repeat([]byte{seed}, sessionSigningKeyLen)) + } + v, err := NewVerifier(keys, clk) + require.NoError(t, err) + return v +} + +func TestJWT_SignVerifyRoundTrip(t *testing.T) { + clk := fixedClock(time.Unix(1_700_000_000, 0).UTC()) + signer := newSignerForTest(t, 1, clk) + verifier := newVerifierForTest(t, []byte{1}, clk) + + principal := AuthPrincipal{AccessKey: "AKIA_OK", Role: RoleFull} + token, err := signer.Sign(principal) + require.NoError(t, err) + require.Equal(t, 2, strings.Count(token, ".")) + + got, err := verifier.Verify(token) + require.NoError(t, err) + require.Equal(t, principal, got) +} + +func TestJWT_RejectsExpired(t *testing.T) { + signClk := fixedClock(time.Unix(1_700_000_000, 0).UTC()) + verifyClk := fixedClock(time.Unix(1_700_000_000+int64(sessionTTL.Seconds()+1), 0).UTC()) + signer := newSignerForTest(t, 1, signClk) + verifier := newVerifierForTest(t, []byte{1}, verifyClk) + + token, err := signer.Sign(AuthPrincipal{AccessKey: "AKIA", Role: RoleReadOnly}) + require.NoError(t, err) + + _, err = verifier.Verify(token) + require.Error(t, err) + require.True(t, errors.Is(err, ErrInvalidToken)) +} + +func TestJWT_RejectsFutureIssued(t *testing.T) { + // Sign in the future; verifier clock is now. + signClk := fixedClock(time.Unix(1_700_000_000+600, 0).UTC()) + verifyClk := fixedClock(time.Unix(1_700_000_000, 0).UTC()) + signer := newSignerForTest(t, 1, signClk) + verifier := newVerifierForTest(t, []byte{1}, verifyClk) + + token, err := signer.Sign(AuthPrincipal{AccessKey: "AKIA", Role: RoleFull}) + require.NoError(t, err) + + _, err = verifier.Verify(token) + require.Error(t, err) + require.True(t, errors.Is(err, ErrInvalidToken)) +} + +func TestJWT_RejectsWrongSignature(t *testing.T) { + clk := fixedClock(time.Unix(1_700_000_000, 0).UTC()) + signer := newSignerForTest(t, 1, clk) + verifier := newVerifierForTest(t, []byte{9}, clk) // different key + + token, err := signer.Sign(AuthPrincipal{AccessKey: "AKIA", Role: RoleFull}) + require.NoError(t, err) + + _, err = verifier.Verify(token) + require.Error(t, err) + require.True(t, errors.Is(err, ErrInvalidToken)) +} + +func TestJWT_PreviousKeyAccepted(t *testing.T) { + clk := fixedClock(time.Unix(1_700_000_000, 0).UTC()) + // Previous key signs the token. + oldSigner := newSignerForTest(t, 2, clk) + token, err := oldSigner.Sign(AuthPrincipal{AccessKey: "AKIA_OLD", Role: RoleReadOnly}) + require.NoError(t, err) + + // Verifier has primary=new, previous=old. + verifier := newVerifierForTest(t, []byte{1, 2}, clk) + got, err := verifier.Verify(token) + require.NoError(t, err) + require.Equal(t, "AKIA_OLD", got.AccessKey) + require.Equal(t, RoleReadOnly, got.Role) +} + +func TestJWT_AfterRotationOldPreviousRejected(t *testing.T) { + clk := fixedClock(time.Unix(1_700_000_000, 0).UTC()) + // Token minted with key seed=7. + signer := newSignerForTest(t, 7, clk) + token, err := signer.Sign(AuthPrincipal{AccessKey: "AKIA", Role: RoleFull}) + require.NoError(t, err) + + // After rotation completes, only seeds {1,2} are configured. + verifier := newVerifierForTest(t, []byte{1, 2}, clk) + _, err = verifier.Verify(token) + require.Error(t, err) + require.True(t, errors.Is(err, ErrInvalidToken)) +} + +func TestJWT_MalformedTokens(t *testing.T) { + clk := fixedClock(time.Unix(1_700_000_000, 0).UTC()) + verifier := newVerifierForTest(t, []byte{1}, clk) + + cases := []string{ + "", + "abc", + "a.b", + "a.b.c.d", + "not-header.payload.sig", + } + for _, tok := range cases { + _, err := verifier.Verify(tok) + require.Errorf(t, err, "token %q should fail", tok) + require.True(t, errors.Is(err, ErrInvalidToken)) + } +} + +func TestJWT_RejectsUnknownRole(t *testing.T) { + clk := fixedClock(time.Unix(1_700_000_000, 0).UTC()) + key := bytes.Repeat([]byte{3}, sessionSigningKeyLen) + // Manually craft a token with role=admin (unsupported). + payload := []byte(`{"sub":"AKIA","role":"admin","iat":1700000000,"exp":1700003600,"jti":"j"}`) + encodedPayload := base64.RawURLEncoding.EncodeToString(payload) + signingInput := jwtHeaderEncoded + "." + encodedPayload + mac := hmac.New(sha256.New, key) + mac.Write([]byte(signingInput)) + sig := base64.RawURLEncoding.EncodeToString(mac.Sum(nil)) + token := signingInput + "." + sig + + verifier := newVerifierForTest(t, []byte{3}, clk) + _, err := verifier.Verify(token) + require.Error(t, err) + require.True(t, errors.Is(err, ErrInvalidToken)) +} + +func TestJWT_RejectsMissingSub(t *testing.T) { + clk := fixedClock(time.Unix(1_700_000_000, 0).UTC()) + key := bytes.Repeat([]byte{4}, sessionSigningKeyLen) + payload := []byte(`{"sub":"","role":"full","iat":1700000000,"exp":1700003600,"jti":"j"}`) + encodedPayload := base64.RawURLEncoding.EncodeToString(payload) + signingInput := jwtHeaderEncoded + "." + encodedPayload + mac := hmac.New(sha256.New, key) + mac.Write([]byte(signingInput)) + sig := base64.RawURLEncoding.EncodeToString(mac.Sum(nil)) + token := signingInput + "." + sig + + verifier := newVerifierForTest(t, []byte{4}, clk) + _, err := verifier.Verify(token) + require.Error(t, err) + require.True(t, errors.Is(err, ErrInvalidToken)) +} + +func TestNewSigner_RejectsWrongKeyLength(t *testing.T) { + _, err := NewSigner([]byte("short"), nil) + require.Error(t, err) +} + +func TestNewVerifier_RejectsEmptyKeys(t *testing.T) { + _, err := NewVerifier(nil, nil) + require.Error(t, err) +} diff --git a/internal/admin/middleware.go b/internal/admin/middleware.go new file mode 100644 index 00000000..88dcfa50 --- /dev/null +++ b/internal/admin/middleware.go @@ -0,0 +1,240 @@ +package admin + +import ( + "context" + "errors" + "log/slog" + "net/http" + "strings" + "time" + + pkgerrors "github.com/cockroachdb/errors" +) + +// Cookie names used throughout the admin surface. We define them in one +// place so the login handler, CSRF middleware, and logout handler cannot +// drift. +const ( + sessionCookieName = "admin_session" + csrfCookieName = "admin_csrf" + csrfHeaderName = "X-Admin-CSRF" + + // defaultBodyLimit matches docs/design 4.4: 64 KiB upper bound for + // every POST/PUT endpoint. DynamoDB table schemas and S3 bucket + // metadata are each well under this bound. + defaultBodyLimit int64 = 64 << 10 +) + +// contextKey is the unexported type for storing values in request +// contexts. Using a string type directly would risk collisions with other +// packages. +type contextKey int + +const ( + ctxKeyPrincipal contextKey = iota + 1 +) + +// PrincipalFromContext returns the authenticated principal associated +// with the request context, or false if the middleware did not set one. +func PrincipalFromContext(ctx context.Context) (AuthPrincipal, bool) { + v, ok := ctx.Value(ctxKeyPrincipal).(AuthPrincipal) + return v, ok +} + +// BodyLimit wraps the request body with http.MaxBytesReader and responds +// 413 when the client exceeds the cap. It also sets +// http.MaxBytesError-aware error translation so the handler does not need +// to distinguish ordinary IO failures from overflow. +func BodyLimit(limit int64) func(http.Handler) http.Handler { + if limit <= 0 { + limit = defaultBodyLimit + } + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Body != nil { + r.Body = http.MaxBytesReader(w, r.Body, limit) + } + next.ServeHTTP(bodyLimitResponseWriter{ResponseWriter: w}, r) + }) + } +} + +// bodyLimitResponseWriter is a minor adapter that lets a handler translate +// its own MaxBytesError into a consistent 413 without duplicating the +// plumbing. At the time of writing, each write handler can call +// r.ParseForm / json.Decode and on error call +// `if errors.As(err, &http.MaxBytesError{}) { ... }` manually; this +// wrapper just forces the header once per request. +type bodyLimitResponseWriter struct { + http.ResponseWriter +} + +// WriteMaxBytesError is called by handlers that detected a MaxBytesError. +// It is a package-level helper rather than a method so the router error +// path keeps the same JSON shape as the rest. +func WriteMaxBytesError(w http.ResponseWriter) { + writeJSONError(w, http.StatusRequestEntityTooLarge, "payload_too_large", + "request body exceeds the 64 KiB admin limit") +} + +// IsMaxBytesError reports whether err was produced because the client +// uploaded more bytes than BodyLimit permits. +func IsMaxBytesError(err error) bool { + if err == nil { + return false + } + var mbe *http.MaxBytesError + return errors.As(err, &mbe) +} + +// SessionAuth parses the admin_session cookie, validates it against the +// verifier, and attaches the resulting AuthPrincipal to the request +// context. Requests without a session, or with an invalid/expired one, +// are rejected with 401. +func SessionAuth(verifier *Verifier) func(http.Handler) http.Handler { + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + cookie, err := r.Cookie(sessionCookieName) + if err != nil || strings.TrimSpace(cookie.Value) == "" { + writeJSONError(w, http.StatusUnauthorized, "unauthenticated", "missing session cookie") + return + } + principal, err := verifier.Verify(cookie.Value) + if err != nil { + writeJSONError(w, http.StatusUnauthorized, "unauthenticated", "invalid or expired session") + return + } + ctx := context.WithValue(r.Context(), ctxKeyPrincipal, principal) + next.ServeHTTP(w, r.WithContext(ctx)) + }) + } +} + +// RequireWriteRole blocks the handler unless the current principal may +// execute write operations. Must be composed after SessionAuth. +func RequireWriteRole() func(http.Handler) http.Handler { + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + principal, ok := PrincipalFromContext(r.Context()) + if !ok { + writeJSONError(w, http.StatusUnauthorized, "unauthenticated", "no principal on context") + return + } + if !principal.Role.AllowsWrite() { + writeJSONError(w, http.StatusForbidden, "forbidden", + "this endpoint requires admin.full_access_keys membership") + return + } + next.ServeHTTP(w, r) + }) + } +} + +// CSRFDoubleSubmit enforces the double-submit cookie rule for state +// changing methods (POST, PUT, DELETE, PATCH). The admin_csrf cookie is +// minted at login; the SPA copies its value into the X-Admin-CSRF header +// on every write. We reject the request if either the cookie or the +// header is missing or if they do not match. GET/HEAD pass through +// untouched. +func CSRFDoubleSubmit() func(http.Handler) http.Handler { + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.Method { + case http.MethodGet, http.MethodHead, http.MethodOptions: + next.ServeHTTP(w, r) + return + } + cookie, err := r.Cookie(csrfCookieName) + if err != nil || strings.TrimSpace(cookie.Value) == "" { + writeJSONError(w, http.StatusForbidden, "csrf_missing", "admin_csrf cookie is required") + return + } + header := strings.TrimSpace(r.Header.Get(csrfHeaderName)) + if header == "" { + writeJSONError(w, http.StatusForbidden, "csrf_missing", + "X-Admin-CSRF header is required for write operations") + return + } + // Constant-time comparison: the values are user-supplied + // and we do not want to leak length differences. + if !constantTimeEq(cookie.Value, header) { + writeJSONError(w, http.StatusForbidden, "csrf_mismatch", "CSRF token mismatch") + return + } + next.ServeHTTP(w, r) + }) + } +} + +func constantTimeEq(a, b string) bool { + if len(a) != len(b) { + return false + } + var diff byte + for i := 0; i < len(a); i++ { + diff |= a[i] ^ b[i] + } + return diff == 0 +} + +// auditRecorder is the ResponseWriter wrapper the Audit middleware uses +// to learn the final status code without forcing the handler to pass it +// back explicitly. +type auditRecorder struct { + http.ResponseWriter + status int + written bool +} + +func (a *auditRecorder) WriteHeader(code int) { + if !a.written { + a.status = code + a.written = true + } + a.ResponseWriter.WriteHeader(code) +} + +func (a *auditRecorder) Write(b []byte) (int, error) { + if !a.written { + a.status = http.StatusOK + a.written = true + } + n, err := a.ResponseWriter.Write(b) + if err != nil { + return n, pkgerrors.Wrap(err, "audit recorder write") + } + return n, nil +} + +// Audit writes a structured slog line for every state-changing admin +// request, as required by docs/design Section 10. GET/HEAD requests are +// not audited (read traffic can be too loud and does not modify state). +// The logger uses the "admin_audit" key so operators can filter. Callers +// wire this middleware after SessionAuth so the principal is available. +func Audit(logger *slog.Logger) func(http.Handler) http.Handler { + if logger == nil { + logger = slog.Default() + } + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.Method { + case http.MethodGet, http.MethodHead, http.MethodOptions: + next.ServeHTTP(w, r) + return + } + rec := &auditRecorder{ResponseWriter: w} + start := time.Now() + next.ServeHTTP(rec, r) + principal, _ := PrincipalFromContext(r.Context()) + logger.LogAttrs(r.Context(), slog.LevelInfo, "admin_audit", + slog.String("actor", principal.AccessKey), + slog.String("role", string(principal.Role)), + slog.String("method", r.Method), + slog.String("path", r.URL.Path), + slog.Int("status", rec.status), + slog.String("remote", r.RemoteAddr), + slog.Duration("duration", time.Since(start)), + ) + }) + } +} diff --git a/internal/admin/middleware_test.go b/internal/admin/middleware_test.go new file mode 100644 index 00000000..6ff6cc75 --- /dev/null +++ b/internal/admin/middleware_test.go @@ -0,0 +1,238 @@ +package admin + +import ( + "bytes" + "context" + "io" + "log/slog" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestBodyLimit_Allows(t *testing.T) { + var gotBody []byte + h := BodyLimit(128)(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + b, err := io.ReadAll(r.Body) + require.NoError(t, err) + gotBody = b + w.WriteHeader(http.StatusNoContent) + })) + req := httptest.NewRequest(http.MethodPost, "/admin/api/v1/x", strings.NewReader("hello")) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + require.Equal(t, http.StatusNoContent, rec.Code) + require.Equal(t, "hello", string(gotBody)) +} + +func TestBodyLimit_Exceeded(t *testing.T) { + oversize := strings.Repeat("x", 200) + h := BodyLimit(64)(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, err := io.ReadAll(r.Body) + if err != nil { + if IsMaxBytesError(err) { + WriteMaxBytesError(w) + return + } + t.Fatalf("unexpected error: %v", err) + } + w.WriteHeader(http.StatusNoContent) + })) + req := httptest.NewRequest(http.MethodPost, "/admin/api/v1/x", strings.NewReader(oversize)) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + require.Equal(t, http.StatusRequestEntityTooLarge, rec.Code) + require.Contains(t, rec.Body.String(), "payload_too_large") +} + +func TestSessionAuth_MissingCookie(t *testing.T) { + clk := fixedClock(time.Unix(1_700_000_000, 0).UTC()) + v := newVerifierForTest(t, []byte{1}, clk) + h := SessionAuth(v)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + })) + + req := httptest.NewRequest(http.MethodGet, "/admin/api/v1/cluster", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + require.Equal(t, http.StatusUnauthorized, rec.Code) + require.Contains(t, rec.Body.String(), "unauthenticated") +} + +func TestSessionAuth_HappyPathPutsPrincipalOnContext(t *testing.T) { + clk := fixedClock(time.Unix(1_700_000_000, 0).UTC()) + signer := newSignerForTest(t, 1, clk) + v := newVerifierForTest(t, []byte{1}, clk) + + principal := AuthPrincipal{AccessKey: "AKIA", Role: RoleFull} + token, err := signer.Sign(principal) + require.NoError(t, err) + + var gotPrincipal AuthPrincipal + h := SessionAuth(v)(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + p, ok := PrincipalFromContext(r.Context()) + require.True(t, ok) + gotPrincipal = p + w.WriteHeader(http.StatusOK) + })) + req := httptest.NewRequest(http.MethodGet, "/admin/api/v1/cluster", nil) + req.AddCookie(&http.Cookie{Name: sessionCookieName, Value: token}) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + + require.Equal(t, http.StatusOK, rec.Code) + require.Equal(t, principal, gotPrincipal) +} + +func TestSessionAuth_InvalidToken(t *testing.T) { + clk := fixedClock(time.Unix(1_700_000_000, 0).UTC()) + v := newVerifierForTest(t, []byte{1}, clk) + h := SessionAuth(v)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + })) + req := httptest.NewRequest(http.MethodGet, "/admin/api/v1/cluster", nil) + req.AddCookie(&http.Cookie{Name: sessionCookieName, Value: "garbage"}) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + require.Equal(t, http.StatusUnauthorized, rec.Code) +} + +func TestRequireWriteRole_ReadOnlyRejected(t *testing.T) { + h := RequireWriteRole()(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + })) + ctx := context.WithValue(context.Background(), ctxKeyPrincipal, + AuthPrincipal{AccessKey: "AKIA_RO", Role: RoleReadOnly}) + req := httptest.NewRequest(http.MethodPost, "/admin/api/v1/dynamo/tables", nil).WithContext(ctx) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + require.Equal(t, http.StatusForbidden, rec.Code) + require.Contains(t, rec.Body.String(), "forbidden") +} + +func TestRequireWriteRole_FullAllowed(t *testing.T) { + h := RequireWriteRole()(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusCreated) + })) + ctx := context.WithValue(context.Background(), ctxKeyPrincipal, + AuthPrincipal{AccessKey: "AKIA_ADMIN", Role: RoleFull}) + req := httptest.NewRequest(http.MethodPost, "/admin/api/v1/dynamo/tables", nil).WithContext(ctx) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + require.Equal(t, http.StatusCreated, rec.Code) +} + +func TestRequireWriteRole_NoPrincipal(t *testing.T) { + h := RequireWriteRole()(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + })) + req := httptest.NewRequest(http.MethodPost, "/admin/api/v1/dynamo/tables", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + require.Equal(t, http.StatusUnauthorized, rec.Code) +} + +func TestCSRF_GETPasses(t *testing.T) { + called := false + h := CSRFDoubleSubmit()(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + called = true + w.WriteHeader(http.StatusOK) + })) + req := httptest.NewRequest(http.MethodGet, "/admin/api/v1/dynamo/tables", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + require.True(t, called) + require.Equal(t, http.StatusOK, rec.Code) +} + +func TestCSRF_WriteMissingCookie(t *testing.T) { + h := CSRFDoubleSubmit()(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + })) + req := httptest.NewRequest(http.MethodPost, "/admin/api/v1/dynamo/tables", nil) + req.Header.Set(csrfHeaderName, "tok") + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + require.Equal(t, http.StatusForbidden, rec.Code) + require.Contains(t, rec.Body.String(), "csrf_missing") +} + +func TestCSRF_WriteMissingHeader(t *testing.T) { + h := CSRFDoubleSubmit()(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + })) + req := httptest.NewRequest(http.MethodPost, "/admin/api/v1/dynamo/tables", nil) + req.AddCookie(&http.Cookie{Name: csrfCookieName, Value: "tok"}) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + require.Equal(t, http.StatusForbidden, rec.Code) + require.Contains(t, rec.Body.String(), "csrf_missing") +} + +func TestCSRF_Mismatch(t *testing.T) { + h := CSRFDoubleSubmit()(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + })) + req := httptest.NewRequest(http.MethodPost, "/admin/api/v1/dynamo/tables", nil) + req.AddCookie(&http.Cookie{Name: csrfCookieName, Value: "a"}) + req.Header.Set(csrfHeaderName, "b") + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + require.Equal(t, http.StatusForbidden, rec.Code) + require.Contains(t, rec.Body.String(), "csrf_mismatch") +} + +func TestCSRF_MatchAllows(t *testing.T) { + h := CSRFDoubleSubmit()(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusAccepted) + })) + req := httptest.NewRequest(http.MethodPost, "/admin/api/v1/dynamo/tables", nil) + req.AddCookie(&http.Cookie{Name: csrfCookieName, Value: "same"}) + req.Header.Set(csrfHeaderName, "same") + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + require.Equal(t, http.StatusAccepted, rec.Code) +} + +func TestAudit_LogsWriteRequest(t *testing.T) { + var buf bytes.Buffer + logger := slog.New(slog.NewJSONHandler(&buf, &slog.HandlerOptions{Level: slog.LevelInfo})) + ctx := context.WithValue(context.Background(), ctxKeyPrincipal, + AuthPrincipal{AccessKey: "AKIA_ADMIN", Role: RoleFull}) + h := Audit(logger)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusNoContent) + })) + req := httptest.NewRequest(http.MethodPost, "/admin/api/v1/dynamo/tables", nil).WithContext(ctx) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + require.Equal(t, http.StatusNoContent, rec.Code) + out := buf.String() + require.Contains(t, out, `"msg":"admin_audit"`) + require.Contains(t, out, `"actor":"AKIA_ADMIN"`) + require.Contains(t, out, `"method":"POST"`) + require.Contains(t, out, `"status":204`) +} + +func TestAudit_SkipsReads(t *testing.T) { + var buf bytes.Buffer + logger := slog.New(slog.NewJSONHandler(&buf, &slog.HandlerOptions{Level: slog.LevelInfo})) + h := Audit(logger)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + })) + req := httptest.NewRequest(http.MethodGet, "/admin/api/v1/dynamo/tables", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + require.Equal(t, http.StatusOK, rec.Code) + require.Empty(t, buf.String()) +} + +func TestConstantTimeEq(t *testing.T) { + require.True(t, constantTimeEq("abc", "abc")) + require.False(t, constantTimeEq("abc", "abd")) + require.False(t, constantTimeEq("abc", "abcd")) + require.True(t, constantTimeEq("", "")) +} diff --git a/internal/admin/principal.go b/internal/admin/principal.go new file mode 100644 index 00000000..883bac79 --- /dev/null +++ b/internal/admin/principal.go @@ -0,0 +1,28 @@ +package admin + +// Role represents the authorization tier of an authenticated admin session. +type Role string + +const ( + // RoleReadOnly permits only GET endpoints. + RoleReadOnly Role = "read_only" + // RoleFull permits the entire admin CRUD surface. + RoleFull Role = "full" +) + +// AllowsWrite reports whether the role may execute state-mutating operations. +func (r Role) AllowsWrite() bool { + return r == RoleFull +} + +// AuthPrincipal is the authenticated caller derived from a session cookie or, +// in the future, a follower→leader forwarded request. The admin handler and +// adapter internal entrypoints pass it around instead of a raw HTTP request +// so the authorization model stays consistent whether the request arrived +// via SigV4 or JWT. +type AuthPrincipal struct { + // AccessKey is the caller's SigV4 access key identifier. + AccessKey string + // Role is the role resolved from the server-side access key table. + Role Role +} diff --git a/internal/admin/ratelimit.go b/internal/admin/ratelimit.go new file mode 100644 index 00000000..126735f8 --- /dev/null +++ b/internal/admin/ratelimit.go @@ -0,0 +1,89 @@ +package admin + +import ( + "net" + "net/http" + "sync" + "time" +) + +// rateLimiterMaxEntries is the point at which the limiter performs a +// sweep of stale entries. Tuned so a reasonable burst of distinct +// source IPs does not force a pathological cleanup on every login. +const rateLimiterMaxEntries = 1024 + +// rateLimiter is a fixed-window, in-memory per-IP rate limiter. It is +// intentionally simple: the admin login endpoint is low-volume, we only +// need to slow brute-force guessing, and distributed accounting would +// require Raft round-trips per login (unreasonable for the threat model). +// Entries older than window are pruned lazily on the next hit. +type rateLimiter struct { + limit int + window time.Duration + clock Clock + + mu sync.Mutex + entries map[string]*rateLimiterEntry +} + +type rateLimiterEntry struct { + windowStart time.Time + count int +} + +func newRateLimiter(limit int, window time.Duration, clock Clock) *rateLimiter { + if clock == nil { + clock = SystemClock + } + return &rateLimiter{ + limit: limit, + window: window, + clock: clock, + entries: make(map[string]*rateLimiterEntry), + } +} + +// allow returns true if the client at ip may perform one more action +// within the current window, otherwise false. It is safe for concurrent +// use. +func (rl *rateLimiter) allow(ip string) bool { + now := rl.clock().UTC() + rl.mu.Lock() + defer rl.mu.Unlock() + + // Opportunistic cleanup: if the map ever exceeds a threshold, walk + // entries and drop stale windows. Keeps memory bounded against a + // spray of distinct source IPs. + if len(rl.entries) > rateLimiterMaxEntries { + for k, v := range rl.entries { + if now.Sub(v.windowStart) > rl.window { + delete(rl.entries, k) + } + } + } + + e, ok := rl.entries[ip] + if !ok || now.Sub(e.windowStart) >= rl.window { + rl.entries[ip] = &rateLimiterEntry{windowStart: now, count: 1} + return true + } + if e.count >= rl.limit { + return false + } + e.count++ + return true +} + +// clientIP extracts the IP part of the request's remote address. It falls +// back to the full RemoteAddr if SplitHostPort fails. We do not consult +// X-Forwarded-For here because the admin listener is expected to run +// directly on the node (loopback or behind a trusted load balancer in +// the TLS case); honouring client-controlled headers would let an +// attacker evade the limiter. +func clientIP(r *http.Request) string { + host, _, err := net.SplitHostPort(r.RemoteAddr) + if err != nil { + return r.RemoteAddr + } + return host +} diff --git a/internal/admin/router.go b/internal/admin/router.go new file mode 100644 index 00000000..56b1e470 --- /dev/null +++ b/internal/admin/router.go @@ -0,0 +1,186 @@ +package admin + +import ( + "errors" + "io/fs" + "net/http" + "path" + "strings" + + "github.com/goccy/go-json" +) + +// Constants for the admin URL namespace. Centralised here so the router, +// handlers, and tests all agree on the paths. The admin listener only +// serves URLs under /admin/*; anything else yields a 404. +const ( + pathPrefixAdmin = "/admin" + pathPrefixAPIv1 = "/admin/api/v1/" + pathHealthz = "/admin/healthz" + pathPrefixAssets = "/admin/assets/" + pathRootAssetsDir = "assets" + pathIndexHTML = "index.html" +) + +// APIHandler is the bridge between the router and all JSON API endpoints. +// Everything under /admin/api/v1/ resolves through it; individual endpoint +// routing is the handler's responsibility (see apiMux below). +type APIHandler http.Handler + +// Router dispatches admin HTTP requests in the strict order mandated by +// the design doc (Section 5.3): API routes first, then healthz, then +// static assets, then SPA fallback. We do NOT use http.ServeMux because +// its LongestPrefix matching rules would let /admin/api/v1/... slip into +// the SPA catch-all when the JSON handler returns a 404. +type Router struct { + api http.Handler + static fs.FS + notFind http.Handler +} + +// NewRouter builds the admin router. +// +// - api handles /admin/api/v1/*. It must return a JSON body itself; the +// router never rewrites its response. +// - static, if non-nil, backs both /admin/assets/* and the /admin/* +// SPA catch-all (which always serves index.html). A nil static FS +// causes 404s for asset and SPA routes, which is the expected state +// while the SPA has not been built yet. +func NewRouter(api http.Handler, static fs.FS) *Router { + return &Router{ + api: api, + static: static, + notFind: http.HandlerFunc(writeJSONNotFound), + } +} + +// ServeHTTP is the single entrypoint. Routing cascade in priority order: +// 1. /admin/api/v1/* → API handler +// 2. /admin/healthz → plain text +// 3. /admin/assets/* → static file +// 4. /admin/* → index.html (SPA fallback) +// 5. anything else → 404 JSON +func (rt *Router) ServeHTTP(w http.ResponseWriter, r *http.Request) { + p := r.URL.Path + + switch { + case strings.HasPrefix(p, pathPrefixAPIv1): + if rt.api == nil { + rt.notFind.ServeHTTP(w, r) + return + } + rt.api.ServeHTTP(w, r) + return + case p == pathHealthz: + rt.serveHealth(w, r) + return + case strings.HasPrefix(p, pathPrefixAssets): + rt.serveAsset(w, r) + return + case p == pathPrefixAdmin || strings.HasPrefix(p, pathPrefixAdmin+"/"): + rt.serveSPA(w, r) + return + } + rt.notFind.ServeHTTP(w, r) +} + +func (rt *Router) serveHealth(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet && r.Method != http.MethodHead { + writeJSONError(w, http.StatusMethodNotAllowed, "method_not_allowed", "only GET or HEAD supported") + return + } + w.Header().Set("Content-Type", "text/plain; charset=utf-8") + w.Header().Set("Cache-Control", "no-store") + w.WriteHeader(http.StatusOK) + if r.Method == http.MethodGet { + _, _ = w.Write([]byte("ok\n")) + } +} + +func (rt *Router) serveAsset(w http.ResponseWriter, r *http.Request) { + if rt.static == nil { + rt.notFind.ServeHTTP(w, r) + return + } + // Drop /admin/assets/ prefix → relative path under pathRootAssetsDir. + rel := strings.TrimPrefix(r.URL.Path, pathPrefixAssets) + if rel == "" || strings.Contains(rel, "..") { + rt.notFind.ServeHTTP(w, r) + return + } + name := path.Join(pathRootAssetsDir, rel) + f, err := rt.static.Open(name) + if err != nil { + if errors.Is(err, fs.ErrNotExist) { + rt.notFind.ServeHTTP(w, r) + return + } + writeJSONError(w, http.StatusInternalServerError, "internal", "failed to open asset") + return + } + defer f.Close() + info, err := f.Stat() + if err != nil || info.IsDir() { + rt.notFind.ServeHTTP(w, r) + return + } + readSeeker, ok := f.(interface { + Read(p []byte) (int, error) + Seek(offset int64, whence int) (int64, error) + }) + if !ok { + // embed.FS files implement ReadSeeker, but be defensive. + writeJSONError(w, http.StatusInternalServerError, "internal", "asset is not seekable") + return + } + http.ServeContent(w, r, name, info.ModTime(), readSeeker) +} + +func (rt *Router) serveSPA(w http.ResponseWriter, r *http.Request) { + if rt.static == nil { + rt.notFind.ServeHTTP(w, r) + return + } + if r.Method != http.MethodGet && r.Method != http.MethodHead { + writeJSONError(w, http.StatusMethodNotAllowed, "method_not_allowed", "only GET or HEAD supported") + return + } + f, err := rt.static.Open(pathIndexHTML) + if err != nil { + rt.notFind.ServeHTTP(w, r) + return + } + defer f.Close() + info, err := f.Stat() + if err != nil || info.IsDir() { + rt.notFind.ServeHTTP(w, r) + return + } + readSeeker, ok := f.(interface { + Read(p []byte) (int, error) + Seek(offset int64, whence int) (int64, error) + }) + if !ok { + writeJSONError(w, http.StatusInternalServerError, "internal", "index.html is not seekable") + return + } + w.Header().Set("Cache-Control", "no-store") + http.ServeContent(w, r, pathIndexHTML, info.ModTime(), readSeeker) +} + +// errorResponse is the JSON shape for every admin error. +type errorResponse struct { + Error string `json:"error"` + Message string `json:"message,omitempty"` +} + +func writeJSONNotFound(w http.ResponseWriter, _ *http.Request) { + writeJSONError(w, http.StatusNotFound, "not_found", "") +} + +func writeJSONError(w http.ResponseWriter, status int, code, msg string) { + w.Header().Set("Content-Type", "application/json; charset=utf-8") + w.Header().Set("Cache-Control", "no-store") + w.WriteHeader(status) + _ = json.NewEncoder(w).Encode(errorResponse{Error: code, Message: msg}) +} diff --git a/internal/admin/router_test.go b/internal/admin/router_test.go new file mode 100644 index 00000000..9554f40e --- /dev/null +++ b/internal/admin/router_test.go @@ -0,0 +1,142 @@ +package admin + +import ( + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + "testing/fstest" + + "github.com/stretchr/testify/require" +) + +func newTestStatic() fstest.MapFS { + return fstest.MapFS{ + "index.html": {Data: []byte("spa")}, + "assets/app.js": {Data: []byte("console.log('ok');")}, + "assets/app.css": {Data: []byte("body { color: red; }")}, + } +} + +func TestRouter_APIPathIsNeverSwallowedBySPA(t *testing.T) { + api := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("X-Came-From", "api") + writeJSONError(w, http.StatusNotFound, "unknown_endpoint", "no handler") + }) + r := NewRouter(api, newTestStatic()) + + req := httptest.NewRequest(http.MethodGet, "/admin/api/v1/unknown", nil) + rec := httptest.NewRecorder() + r.ServeHTTP(rec, req) + + require.Equal(t, http.StatusNotFound, rec.Code) + require.Equal(t, "api", rec.Header().Get("X-Came-From")) + require.Contains(t, rec.Header().Get("Content-Type"), "application/json") + require.NotContains(t, rec.Body.String(), " Date: Sat, 25 Apr 2026 01:16:38 +0900 Subject: [PATCH 02/15] admin: address first-round review (security + robustness) Addresses the 7 findings on PR #623 from gemini-code-assist and chatgpt-codex: - cluster_handler: stop leaking the raft/store error text to clients; log the full error via slog and return a generic "cluster_describe_failed" body. Test updated to assert the sentinel string does not appear in the response. - auth_handler credential compare: hash both sides with SHA-256 before subtle.ConstantTimeCompare so length-based timing side channels cannot leak the secret size. Keep the unknown-access-key timing parity with a fixed placeholder digest. - auth_handler input handling: stop trimming SecretKey. Trimming was accepting inputs the SigV4 data-plane adapter would reject, creating a cross-protocol inconsistency; AccessKey (identifier) still trims. - auth_handler audit: login/logout now emit their own admin_audit slog entries. The generic Audit middleware cannot see the claimed actor at that point in the chain, so the Audit wrapper is dropped from publicAuth and replaced with explicit logging that captures actor, claimed_actor, remote, and status. Logout best-effort decodes the session cookie via the Verifier so the audit line can name the caller even when the session has not yet been revoked. - ratelimit: enforce a hard cap on distinct tracked IPs. Once the cap is reached we sweep expired windows first; if that still leaves the map full we refuse new entries instead of silently evicting a legitimate user in-progress state. Adds dedicated tests for the cap and window-reclamation paths. - main_admin: do not load --s3CredentialsFile when admin.enabled is false. A bad credentials file should not abort process startup when the admin listener is disabled. Tests: new auth_audit_test.go covers actor recording on login success / failure and logout with and without a session cookie, plus a same-length/different-length timing smoke check. golangci-lint ./... --new-from-rev=origin/main --timeout=5m: 0 issues. go test -race ./internal/admin/... . green. --- internal/admin/auth_audit_test.go | 120 +++++++++++++++++++ internal/admin/auth_handler.go | 156 +++++++++++++++++++++---- internal/admin/cluster_handler.go | 28 ++++- internal/admin/cluster_handler_test.go | 5 +- internal/admin/ratelimit.go | 60 +++++++--- internal/admin/ratelimit_test.go | 45 +++++++ internal/admin/server.go | 31 +++-- main_admin.go | 8 ++ 8 files changed, 396 insertions(+), 57 deletions(-) create mode 100644 internal/admin/auth_audit_test.go create mode 100644 internal/admin/ratelimit_test.go diff --git a/internal/admin/auth_audit_test.go b/internal/admin/auth_audit_test.go new file mode 100644 index 00000000..d36308d5 --- /dev/null +++ b/internal/admin/auth_audit_test.go @@ -0,0 +1,120 @@ +package admin + +import ( + "bytes" + "log/slog" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func newAuthServiceWithAudit(t *testing.T) (*AuthService, *bytes.Buffer) { + t.Helper() + clk := fixedClock(time.Unix(1_700_000_000, 0).UTC()) + signer := newSignerForTest(t, 1, clk) + verifier := newVerifierForTest(t, []byte{1}, clk) + + creds := MapCredentialStore{ + "AKIA_ADMIN": "ADMIN_SECRET", + } + roles := map[string]Role{ + "AKIA_ADMIN": RoleFull, + } + buf := &bytes.Buffer{} + logger := slog.New(slog.NewJSONHandler(buf, &slog.HandlerOptions{Level: slog.LevelInfo})) + svc := NewAuthService(signer, creds, roles, AuthServiceOpts{ + Clock: clk, + Verifier: verifier, + Logger: logger, + }) + return svc, buf +} + +func TestAudit_LoginSuccessRecordsActor(t *testing.T) { + svc, buf := newAuthServiceWithAudit(t) + req := postJSON(t, loginRequest{AccessKey: "AKIA_ADMIN", SecretKey: "ADMIN_SECRET"}) + rec := httptest.NewRecorder() + svc.HandleLogin(rec, req) + + require.Equal(t, http.StatusOK, rec.Code) + out := buf.String() + require.Contains(t, out, `"msg":"admin_audit"`) + require.Contains(t, out, `"action":"login"`) + require.Contains(t, out, `"actor":"AKIA_ADMIN"`) + require.Contains(t, out, `"claimed_actor":"AKIA_ADMIN"`) + require.Contains(t, out, `"status":200`) +} + +func TestAudit_LoginFailureRecordsClaimedActor(t *testing.T) { + svc, buf := newAuthServiceWithAudit(t) + req := postJSON(t, loginRequest{AccessKey: "AKIA_ADMIN", SecretKey: "WRONG"}) + rec := httptest.NewRecorder() + svc.HandleLogin(rec, req) + + require.Equal(t, http.StatusUnauthorized, rec.Code) + out := buf.String() + require.Contains(t, out, `"action":"login"`) + // We did NOT authenticate, so actor is empty. + require.Contains(t, out, `"actor":""`) + // But the claimed actor is still logged so operators can track + // which access key was targeted by brute-force attempts. + require.Contains(t, out, `"claimed_actor":"AKIA_ADMIN"`) + require.Contains(t, out, `"status":401`) +} + +func TestAudit_LogoutDecodesCookieForActor(t *testing.T) { + svc, buf := newAuthServiceWithAudit(t) + + // Log in first. + loginReq := postJSON(t, loginRequest{AccessKey: "AKIA_ADMIN", SecretKey: "ADMIN_SECRET"}) + loginRec := httptest.NewRecorder() + svc.HandleLogin(loginRec, loginReq) + require.Equal(t, http.StatusOK, loginRec.Code) + cookies := loginRec.Result().Cookies() + buf.Reset() + + // Now log out with the session cookie — audit must record actor. + req := httptest.NewRequest(http.MethodPost, "/admin/api/v1/auth/logout", nil) + req.RemoteAddr = "127.0.0.1:1" + for _, c := range cookies { + req.AddCookie(c) + } + rec := httptest.NewRecorder() + svc.HandleLogout(rec, req) + + require.Equal(t, http.StatusNoContent, rec.Code) + out := buf.String() + require.Contains(t, out, `"action":"logout"`) + require.Contains(t, out, `"actor":"AKIA_ADMIN"`) +} + +func TestAudit_LogoutWithoutCookieEmptyActor(t *testing.T) { + svc, buf := newAuthServiceWithAudit(t) + req := httptest.NewRequest(http.MethodPost, "/admin/api/v1/auth/logout", nil) + req.RemoteAddr = "127.0.0.1:1" + rec := httptest.NewRecorder() + svc.HandleLogout(rec, req) + + require.Equal(t, http.StatusNoContent, rec.Code) + out := buf.String() + require.Contains(t, out, `"action":"logout"`) + require.Contains(t, out, `"actor":""`) +} + +func TestAudit_LoginLengthTimingHashed(t *testing.T) { + // Same-length secret mismatch and different-length secret mismatch + // must both reach the failure path without short-circuiting on + // length. We cannot time them precisely in a unit test, but we can + // at least verify both paths emit the same failure response. + svc, _ := newAuthServiceWithAudit(t) + for _, secret := range []string{"x", "much-longer-wrong-secret-value-here"} { + req := postJSON(t, loginRequest{AccessKey: "AKIA_ADMIN", SecretKey: secret}) + rec := httptest.NewRecorder() + svc.HandleLogin(rec, req) + require.Equal(t, http.StatusUnauthorized, rec.Code) + require.Contains(t, rec.Body.String(), "invalid_credentials") + } +} diff --git a/internal/admin/auth_handler.go b/internal/admin/auth_handler.go index abfce86a..e4f81fd4 100644 --- a/internal/admin/auth_handler.go +++ b/internal/admin/auth_handler.go @@ -2,9 +2,11 @@ package admin import ( "crypto/rand" + "crypto/sha256" "crypto/subtle" "encoding/base64" "io" + "log/slog" "net/http" "strings" "time" @@ -38,12 +40,14 @@ func (m MapCredentialStore) LookupSecret(accessKey string) (string, bool) { // startup and reuse across the admin listener's lifetime. type AuthService struct { signer *Signer + verifier *Verifier creds CredentialStore roles map[string]Role limiter *rateLimiter sessionTTL time.Duration secureCookie bool cookieDomain string + logger *slog.Logger } // AuthServiceOpts covers the knobs a caller may want to vary in tests. @@ -62,6 +66,14 @@ type AuthServiceOpts struct { LoginWindow time.Duration // Clock drives rate-limiter aging. Defaults to SystemClock. Clock Clock + // Verifier lets the logout handler best-effort decode the + // incoming session cookie and include the actor in the audit + // log. When nil, logout events are still audited but with an + // empty actor field. + Verifier *Verifier + // Logger is the slog destination for admin_audit entries emitted + // by the login/logout handlers. nil falls back to slog.Default(). + Logger *slog.Logger } // NewAuthService constructs an AuthService. The signer must be primary @@ -79,14 +91,20 @@ func NewAuthService(signer *Signer, creds CredentialStore, roles map[string]Role if opts.Clock == nil { opts.Clock = SystemClock } + logger := opts.Logger + if logger == nil { + logger = slog.Default() + } return &AuthService{ signer: signer, + verifier: opts.Verifier, creds: creds, roles: roles, limiter: newRateLimiter(limit, window, opts.Clock), sessionTTL: sessionTTL, secureCookie: !opts.InsecureCookie, cookieDomain: opts.CookieDomain, + logger: logger, } } @@ -109,19 +127,28 @@ type loginResponse struct { // It is safe to expose without the SessionAuth middleware because this is // where a session first comes from; rate limiting, Content-Type validation, // and constant-time credential comparison guard it. +// +// Login events (success and failure) emit admin_audit slog entries +// directly. The generic Audit middleware cannot do this because it runs +// before the handler knows who the caller is claiming to be. func (s *AuthService) HandleLogin(w http.ResponseWriter, r *http.Request) { - if !s.preflightLogin(w, r) { + rec := newStatusRecorder(w) + defer s.auditLogin(r, rec) + + if !s.preflightLogin(rec, r) { return } - req, ok := readLoginRequest(w, r) + req, ok := readLoginRequest(rec, r) + rec.claimedActor = req.AccessKey if !ok { return } - principal, ok := s.authenticate(w, req) + principal, ok := s.authenticate(rec, req) if !ok { return } - s.issueSession(w, principal) + s.issueSession(rec, principal) + rec.actor = principal.AccessKey } func (s *AuthService) preflightLogin(w http.ResponseWriter, r *http.Request) bool { @@ -159,8 +186,13 @@ func readLoginRequest(w http.ResponseWriter, r *http.Request) (loginRequest, boo writeJSONError(w, http.StatusBadRequest, "invalid_body", "body is not valid JSON") return loginRequest{}, false } + // Access keys are AWS-style identifiers that users sometimes copy + // with surrounding whitespace; trimming there is harmless and + // matches how the S3 adapter normalises its credential table at + // load time. Secrets, by contrast, are opaque bytes — trimming + // would accept inputs the SigV4 adapter would reject, creating a + // cross-protocol inconsistency. Leave SecretKey untouched. req.AccessKey = strings.TrimSpace(req.AccessKey) - req.SecretKey = strings.TrimSpace(req.SecretKey) if req.AccessKey == "" || req.SecretKey == "" { writeJSONError(w, http.StatusBadRequest, "missing_fields", "access_key and secret_key are required") @@ -169,23 +201,31 @@ func readLoginRequest(w http.ResponseWriter, r *http.Request) (loginRequest, boo return req, true } -// dummySecretLen is the length we pad the timing-safe comparison with -// when the access key is unknown. It roughly matches the length of a -// typical AWS secret-access-key. -const dummySecretLen = 40 +// sha256Digest returns the SHA-256 hash of s as a []byte of length 32. +// We compare hashes, not raw secrets, so length-based timing attacks +// through subtle.ConstantTimeCompare are neutralised: both sides are +// always 32 bytes regardless of the underlying secret's length. +func sha256Digest(s string) []byte { + sum := sha256.Sum256([]byte(s)) + return sum[:] +} + +// unknownKeyPlaceholder is the deterministic digest we compare against +// when the caller-supplied access key is not in the credential store. +// Using a fixed digest keeps the work done here roughly equivalent to +// the "known key, wrong secret" path, so a timing side-channel cannot +// distinguish "unknown access key" from "wrong secret". +var unknownKeyPlaceholder = sha256Digest("admin-auth-unknown-key-placeholder") func (s *AuthService) authenticate(w http.ResponseWriter, req loginRequest) (AuthPrincipal, bool) { + providedHash := sha256Digest(req.SecretKey) expected, known := s.creds.LookupSecret(req.AccessKey) - if !known { - // Still compare against a dummy secret to keep timing - // roughly equivalent between known and unknown keys. - dummy := strings.Repeat("x", dummySecretLen) - _ = subtle.ConstantTimeCompare([]byte(req.SecretKey), []byte(dummy)) - writeJSONError(w, http.StatusUnauthorized, "invalid_credentials", - "access_key or secret_key is invalid") - return AuthPrincipal{}, false + expectedHash := unknownKeyPlaceholder + if known { + expectedHash = sha256Digest(expected) } - if subtle.ConstantTimeCompare([]byte(req.SecretKey), []byte(expected)) != 1 { + match := subtle.ConstantTimeCompare(providedHash, expectedHash) == 1 + if !known || !match { writeJSONError(w, http.StatusUnauthorized, "invalid_credentials", "access_key or secret_key is invalid") return AuthPrincipal{}, false @@ -220,16 +260,84 @@ func (s *AuthService) issueSession(w http.ResponseWriter, principal AuthPrincipa } // HandleLogout clears both cookies. It does not require authentication — -// clearing stale cookies after a session has expired is always safe. +// clearing stale cookies after a session has expired is always safe. We +// best-effort decode the incoming session cookie so the audit log can +// record who logged out; a missing or invalid cookie leaves actor empty. func (s *AuthService) HandleLogout(w http.ResponseWriter, r *http.Request) { + rec := newStatusRecorder(w) + defer s.auditLogout(r, rec) if r.Method != http.MethodPost { - writeJSONError(w, http.StatusMethodNotAllowed, "method_not_allowed", "logout requires POST") + writeJSONError(rec, http.StatusMethodNotAllowed, "method_not_allowed", "logout requires POST") return } - http.SetCookie(w, s.buildExpiredCookie(sessionCookieName, true)) - http.SetCookie(w, s.buildExpiredCookie(csrfCookieName, false)) - w.Header().Set("Cache-Control", "no-store") - w.WriteHeader(http.StatusNoContent) + if s.verifier != nil { + if c, err := r.Cookie(sessionCookieName); err == nil && strings.TrimSpace(c.Value) != "" { + if p, verr := s.verifier.Verify(c.Value); verr == nil { + rec.actor = p.AccessKey + } + } + } + http.SetCookie(rec, s.buildExpiredCookie(sessionCookieName, true)) + http.SetCookie(rec, s.buildExpiredCookie(csrfCookieName, false)) + rec.Header().Set("Cache-Control", "no-store") + rec.WriteHeader(http.StatusNoContent) +} + +// statusRecorder captures the response status + writes we emit so the +// audit log can include both the final code and the claimed actor. +type statusRecorder struct { + http.ResponseWriter + status int + claimedActor string // what the caller said they were + actor string // what we authenticated them as (empty on failure) +} + +func newStatusRecorder(w http.ResponseWriter) *statusRecorder { + return &statusRecorder{ResponseWriter: w} +} + +func (r *statusRecorder) WriteHeader(code int) { + if r.status == 0 { + r.status = code + } + r.ResponseWriter.WriteHeader(code) +} + +func (r *statusRecorder) Write(b []byte) (int, error) { + if r.status == 0 { + r.status = http.StatusOK + } + n, err := r.ResponseWriter.Write(b) + if err != nil { + return n, errors.Wrap(err, "status recorder write") + } + return n, nil +} + +func (s *AuthService) auditLogin(r *http.Request, rec *statusRecorder) { + s.logger.LogAttrs(r.Context(), slog.LevelInfo, "admin_audit", + slog.String("action", "login"), + slog.String("actor", rec.actor), + slog.String("claimed_actor", rec.claimedActor), + slog.String("remote", r.RemoteAddr), + slog.Int("status", nonZero(rec.status, http.StatusOK)), + ) +} + +func (s *AuthService) auditLogout(r *http.Request, rec *statusRecorder) { + s.logger.LogAttrs(r.Context(), slog.LevelInfo, "admin_audit", + slog.String("action", "logout"), + slog.String("actor", rec.actor), + slog.String("remote", r.RemoteAddr), + slog.Int("status", nonZero(rec.status, http.StatusOK)), + ) +} + +func nonZero(v, fallback int) int { + if v == 0 { + return fallback + } + return v } func (s *AuthService) buildCookie(name, value string, httpOnly bool) *http.Cookie { diff --git a/internal/admin/cluster_handler.go b/internal/admin/cluster_handler.go index 64e8357f..2b5d22ef 100644 --- a/internal/admin/cluster_handler.go +++ b/internal/admin/cluster_handler.go @@ -2,6 +2,7 @@ package admin import ( "context" + "log/slog" "net/http" "time" @@ -47,15 +48,30 @@ func (f ClusterInfoFunc) Describe(ctx context.Context) (ClusterInfo, error) { // ClusterHandler serves GET /admin/api/v1/cluster. type ClusterHandler struct { source ClusterInfoSource + logger *slog.Logger } -// NewClusterHandler wires a source into the HTTP handler. +// NewClusterHandler wires a source into the HTTP handler. Passing a nil +// logger falls back to slog.Default() so existing callers keep working. func NewClusterHandler(source ClusterInfoSource) *ClusterHandler { - return &ClusterHandler{source: source} + return &ClusterHandler{source: source, logger: slog.Default()} +} + +// WithLogger overrides the default slog destination. Kept as an option +// so main.go can attach a component tag without changing the +// constructor signature. +func (h *ClusterHandler) WithLogger(l *slog.Logger) *ClusterHandler { + if l == nil { + return h + } + h.logger = l + return h } // ServeHTTP renders the cluster snapshot as JSON. Errors from the source -// become 500s; the body never exposes internal state beyond an error code. +// are logged on the server with full detail and surfaced to the client as +// a generic "cluster_describe_failed" code. Leaking err.Error() to +// unauthenticated-ish clients would reveal raft/store internals. func (h *ClusterHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodGet { writeJSONError(w, http.StatusMethodNotAllowed, "method_not_allowed", "only GET") @@ -63,7 +79,11 @@ func (h *ClusterHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { } info, err := h.source.Describe(r.Context()) if err != nil { - writeJSONError(w, http.StatusInternalServerError, "cluster_describe_failed", err.Error()) + h.logger.LogAttrs(r.Context(), slog.LevelError, "admin cluster describe failed", + slog.String("error", err.Error()), + ) + writeJSONError(w, http.StatusInternalServerError, "cluster_describe_failed", + "failed to describe cluster state; see server logs") return } if info.Timestamp.IsZero() { diff --git a/internal/admin/cluster_handler_test.go b/internal/admin/cluster_handler_test.go index 289fbdad..f77dbd1b 100644 --- a/internal/admin/cluster_handler_test.go +++ b/internal/admin/cluster_handler_test.go @@ -55,7 +55,7 @@ func TestClusterHandler_PreservesExplicitTimestamp(t *testing.T) { func TestClusterHandler_SourceErrorReturns500(t *testing.T) { source := ClusterInfoFunc(func(_ context.Context) (ClusterInfo, error) { - return ClusterInfo{}, errors.New("describe failed") + return ClusterInfo{}, errors.New("raft storage sentinel XYZ123") }) h := NewClusterHandler(source) req := httptest.NewRequest(http.MethodGet, "/admin/api/v1/cluster", nil) @@ -64,6 +64,9 @@ func TestClusterHandler_SourceErrorReturns500(t *testing.T) { require.Equal(t, http.StatusInternalServerError, rec.Code) require.Contains(t, rec.Body.String(), "cluster_describe_failed") + // The raw error must not leak to the client. + require.NotContains(t, rec.Body.String(), "XYZ123") + require.NotContains(t, rec.Body.String(), "raft storage sentinel") } func TestClusterHandler_OnlyGET(t *testing.T) { diff --git a/internal/admin/ratelimit.go b/internal/admin/ratelimit.go index 126735f8..ce9dc259 100644 --- a/internal/admin/ratelimit.go +++ b/internal/admin/ratelimit.go @@ -7,9 +7,13 @@ import ( "time" ) -// rateLimiterMaxEntries is the point at which the limiter performs a -// sweep of stale entries. Tuned so a reasonable burst of distinct -// source IPs does not force a pathological cleanup on every login. +// rateLimiterMaxEntries is a hard cap on the number of distinct source +// IPs the limiter will track at once. Hitting the cap means an attacker +// is spraying us with unique source addresses; we respond by refusing +// to add new entries (and therefore refusing those logins) until the +// window ages them out. We first sweep expired windows before +// concluding the map is full, so well-behaved traffic never trips on +// the cap. const rateLimiterMaxEntries = 1024 // rateLimiter is a fixed-window, in-memory per-IP rate limiter. It is @@ -51,29 +55,47 @@ func (rl *rateLimiter) allow(ip string) bool { rl.mu.Lock() defer rl.mu.Unlock() - // Opportunistic cleanup: if the map ever exceeds a threshold, walk - // entries and drop stale windows. Keeps memory bounded against a - // spray of distinct source IPs. - if len(rl.entries) > rateLimiterMaxEntries { - for k, v := range rl.entries { - if now.Sub(v.windowStart) > rl.window { - delete(rl.entries, k) - } - } - } - e, ok := rl.entries[ip] - if !ok || now.Sub(e.windowStart) >= rl.window { - rl.entries[ip] = &rateLimiterEntry{windowStart: now, count: 1} + if ok { + if now.Sub(e.windowStart) >= rl.window { + e.windowStart = now + e.count = 1 + return true + } + if e.count >= rl.limit { + return false + } + e.count++ return true } - if e.count >= rl.limit { - return false + + // Unknown IP — we need to create a new entry. Enforce the hard + // cap on distinct tracked IPs before doing so. + if len(rl.entries) >= rateLimiterMaxEntries { + // Try to reclaim space from expired windows first. If that + // still leaves us at cap, refuse the new entry. Refusing + // (rather than evicting an arbitrary old entry) is safer: + // it prevents a spray of fresh IPs from silently erasing a + // legitimate user's in-progress rate-limit state. + rl.sweepExpiredLocked(now) + if len(rl.entries) >= rateLimiterMaxEntries { + return false + } } - e.count++ + rl.entries[ip] = &rateLimiterEntry{windowStart: now, count: 1} return true } +// sweepExpiredLocked drops entries whose window has elapsed. The caller +// must hold rl.mu. +func (rl *rateLimiter) sweepExpiredLocked(now time.Time) { + for k, v := range rl.entries { + if now.Sub(v.windowStart) >= rl.window { + delete(rl.entries, k) + } + } +} + // clientIP extracts the IP part of the request's remote address. It falls // back to the full RemoteAddr if SplitHostPort fails. We do not consult // X-Forwarded-For here because the admin listener is expected to run diff --git a/internal/admin/ratelimit_test.go b/internal/admin/ratelimit_test.go new file mode 100644 index 00000000..49df26c7 --- /dev/null +++ b/internal/admin/ratelimit_test.go @@ -0,0 +1,45 @@ +package admin + +import ( + "strconv" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestRateLimiter_HardCapRefusesNewIPs(t *testing.T) { + now := time.Unix(1_700_000_000, 0).UTC() + clk := func() time.Time { return now } + rl := newRateLimiter(5, time.Minute, clk) + + // Fill the map to the cap by bringing in distinct IPs. + for i := 0; i < rateLimiterMaxEntries; i++ { + ip := "10.0." + strconv.Itoa(i/256) + "." + strconv.Itoa(i%256) + require.Truef(t, rl.allow(ip), "IP %s should be allowed when map is below cap", ip) + } + require.Equal(t, rateLimiterMaxEntries, len(rl.entries)) + + // A brand-new IP must be refused because the cap is reached and + // no entries have expired yet. + require.False(t, rl.allow("192.168.1.1")) + + // Existing IPs are still accounted for (not evicted). + require.True(t, rl.allow("10.0.0.0")) +} + +func TestRateLimiter_HardCapReclaimsAfterWindow(t *testing.T) { + now := time.Unix(1_700_000_000, 0).UTC() + clk := func() time.Time { return now } + rl := newRateLimiter(5, time.Minute, clk) + + for i := 0; i < rateLimiterMaxEntries; i++ { + ip := "10.0." + strconv.Itoa(i/256) + "." + strconv.Itoa(i%256) + require.True(t, rl.allow(ip)) + } + + // Advance past the window — the next allow() call must be able to + // sweep the expired entries and then admit the new IP. + now = now.Add(2 * time.Minute) + require.True(t, rl.allow("192.168.2.2")) +} diff --git a/internal/admin/server.go b/internal/admin/server.go index 5175bd69..916e7a9f 100644 --- a/internal/admin/server.go +++ b/internal/admin/server.go @@ -76,8 +76,20 @@ func NewServer(deps ServerDeps) (*Server, error) { logger = slog.Default() } - auth := NewAuthService(deps.Signer, deps.Credentials, deps.Roles, deps.AuthOpts) - mux := buildAPIMux(auth, deps.Verifier, deps.ClusterInfo, logger) + // Inject verifier + logger into AuthService so login/logout can + // emit admin_audit entries themselves (the generic Audit + // middleware cannot, because those endpoints do not run through + // SessionAuth). + authOpts := deps.AuthOpts + if authOpts.Verifier == nil { + authOpts.Verifier = deps.Verifier + } + if authOpts.Logger == nil { + authOpts.Logger = logger + } + auth := NewAuthService(deps.Signer, deps.Credentials, deps.Roles, authOpts) + cluster := NewClusterHandler(deps.ClusterInfo).WithLogger(logger) + mux := buildAPIMux(auth, deps.Verifier, cluster, logger) router := NewRouter(mux, deps.StaticFS) return &Server{deps: deps, router: router, auth: auth, mux: mux}, nil } @@ -102,13 +114,13 @@ func (s *Server) APIHandler() http.Handler { // POST /admin/api/v1/auth/logout (no auth required) // GET /admin/api/v1/cluster (auth required) // -// Body limit + audit apply uniformly. CSRF applies to writes that require -// a session; login has its own CSRF story (the cookie is minted inside -// it, so we cannot require double-submit on the first call). -func buildAPIMux(auth *AuthService, verifier *Verifier, cluster ClusterInfoSource, logger *slog.Logger) http.Handler { +// Body limit applies uniformly. CSRF and Audit middleware apply to +// write-capable protected endpoints; login and logout carry their own +// audit path inside AuthService because the generic Audit middleware +// cannot see the claimed actor at that point in the chain. +func buildAPIMux(auth *AuthService, verifier *Verifier, clusterHandler http.Handler, logger *slog.Logger) http.Handler { loginHandler := http.HandlerFunc(auth.HandleLogin) logoutHandler := http.HandlerFunc(auth.HandleLogout) - clusterHandler := NewClusterHandler(cluster) // The protected chain: body limit → session auth → CSRF → audit. protect := func(next http.Handler) http.Handler { @@ -120,9 +132,10 @@ func buildAPIMux(auth *AuthService, verifier *Verifier, cluster ClusterInfoSourc ), ) } - // Login / logout: body limit + audit, but NOT session auth or CSRF. + // Login / logout: body limit only. Audit is handled inside the + // AuthService so the actor field is populated correctly. publicAuth := func(next http.Handler) http.Handler { - return BodyLimit(defaultBodyLimit)(Audit(logger)(next)) + return BodyLimit(defaultBodyLimit)(next) } return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { diff --git a/main_admin.go b/main_admin.go index 915aac82..2d8e0dc6 100644 --- a/main_admin.go +++ b/main_admin.go @@ -51,7 +51,15 @@ type adminListenerConfig struct { // startAdminFromFlags is the single entrypoint main.run() uses to stand // up the admin listener. It owns the flag → config translation and the // credentials loading so run() does not inherit that complexity. +// +// When admin is disabled (the default) the function returns immediately +// without touching --s3CredentialsFile: pulling the admin feature into +// a hard dependency on that file would break deployments that never +// intended to use it. func startAdminFromFlags(ctx context.Context, lc *net.ListenConfig, eg *errgroup.Group, runtimes []*raftGroupRuntime) error { + if !*adminEnabled { + return nil + } staticCreds, err := loadS3StaticCredentials(*s3CredsFile) if err != nil { return errors.Wrapf(err, "load static credentials for admin listener") From 474a013061cfdea84b71500c0a1939d531558fb1 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 01:40:28 +0900 Subject: [PATCH 03/15] admin: address second-round review (CodeQL + codex P2) - auth_handler secret compare: replace naked SHA-256 with HMAC-SHA256 keyed by a per-process random value. The construction is still only used to normalise inputs to a fixed-width constant-time comparator; using a keyed MAC satisfies CodeQL static analysis and communicates the intent more precisely (timing-safe comparator, not persisted password hash). Deliberately NOT bcrypt / argon2: nothing is persisted, the secret arrives in plaintext over TLS, and the rate limiter already bounds online guessing. - config.validateTLS: fail startup when admin.tls.cert_file and admin.tls.key_file are not both set together. A typo that leaves one empty would previously treat TLS as disabled silently; we now surface the misconfiguration at startup. Added unit test covering the two partial-config cases. --- internal/admin/auth_handler.go | 70 ++++++++++++++++++++++++++-------- internal/admin/config.go | 12 +++++- internal/admin/config_test.go | 22 +++++++++++ 3 files changed, 87 insertions(+), 17 deletions(-) diff --git a/internal/admin/auth_handler.go b/internal/admin/auth_handler.go index e4f81fd4..d92f3aa1 100644 --- a/internal/admin/auth_handler.go +++ b/internal/admin/auth_handler.go @@ -1,6 +1,7 @@ package admin import ( + "crypto/hmac" "crypto/rand" "crypto/sha256" "crypto/subtle" @@ -9,6 +10,7 @@ import ( "log/slog" "net/http" "strings" + "sync" "time" "github.com/cockroachdb/errors" @@ -201,28 +203,64 @@ func readLoginRequest(w http.ResponseWriter, r *http.Request) (loginRequest, boo return req, true } -// sha256Digest returns the SHA-256 hash of s as a []byte of length 32. -// We compare hashes, not raw secrets, so length-based timing attacks -// through subtle.ConstantTimeCompare are neutralised: both sides are -// always 32 bytes regardless of the underlying secret's length. -func sha256Digest(s string) []byte { - sum := sha256.Sum256([]byte(s)) - return sum[:] +// secretCompareKey is a per-process random key used to derive +// fixed-length digests of incoming and expected login secrets before a +// constant-time comparison. The key itself does not need to be secret — +// its only job is to: +// +// 1. normalise inputs to a fixed 32-byte width so subtle.ConstantTimeCompare +// cannot leak the length of the expected secret via an early-return, +// 2. make the construction a keyed MAC rather than a naked password hash, +// which keeps static analysis (CodeQL) aligned with the intent: this +// is a timing-safe comparator, not a persisted password hash. +// +// We deliberately do not use bcrypt / argon2 here: nothing is persisted, +// the secret is received in plaintext over TLS at login time, and the +// rate limiter already bounds online guessing. A computationally +// expensive KDF would add latency to every login attempt without +// changing the threat model. +var ( + secretCompareKey []byte + secretCompareKeyOnce sync.Once +) + +func initSecretCompareKey() { + secretCompareKey = make([]byte, sha256.Size) + if _, err := rand.Read(secretCompareKey); err != nil { + // rand.Read never fails on supported platforms; if it did, + // panicking is the right reaction — the admin listener + // cannot authenticate anyone anyway. + panic("admin: crypto/rand failure while initialising secret compare key: " + err.Error()) + } +} + +// digestForCompare returns HMAC-SHA256(secretCompareKey, s). Used only +// for timing-safe comparison of login secrets; never stored. +func digestForCompare(s string) []byte { + secretCompareKeyOnce.Do(initSecretCompareKey) + mac := hmac.New(sha256.New, secretCompareKey) + mac.Write([]byte(s)) + return mac.Sum(nil) } -// unknownKeyPlaceholder is the deterministic digest we compare against -// when the caller-supplied access key is not in the credential store. -// Using a fixed digest keeps the work done here roughly equivalent to -// the "known key, wrong secret" path, so a timing side-channel cannot -// distinguish "unknown access key" from "wrong secret". -var unknownKeyPlaceholder = sha256Digest("admin-auth-unknown-key-placeholder") +// unknownKeyPlaceholder is a sentinel digest we compare against when the +// caller-supplied access key is not in the credential store. Using a +// deterministic value here keeps the work done by the authenticate path +// roughly equivalent between "unknown key" and "known key, wrong +// secret", so the two branches are not distinguishable by timing. +var unknownKeyPlaceholder = func() []byte { + secretCompareKeyOnce.Do(initSecretCompareKey) + mac := hmac.New(sha256.New, secretCompareKey) + mac.Write([]byte("admin-auth-unknown-key-placeholder")) + return mac.Sum(nil) +} func (s *AuthService) authenticate(w http.ResponseWriter, req loginRequest) (AuthPrincipal, bool) { - providedHash := sha256Digest(req.SecretKey) + providedHash := digestForCompare(req.SecretKey) expected, known := s.creds.LookupSecret(req.AccessKey) - expectedHash := unknownKeyPlaceholder + expectedHash := unknownKeyPlaceholder() if known { - expectedHash = sha256Digest(expected) + expectedHash = digestForCompare(expected) } match := subtle.ConstantTimeCompare(providedHash, expectedHash) == 1 if !known || !match { diff --git a/internal/admin/config.go b/internal/admin/config.go index 6ec23f08..b605624a 100644 --- a/internal/admin/config.go +++ b/internal/admin/config.go @@ -89,7 +89,17 @@ func (c *Config) validateListen() error { } func (c *Config) validateTLS() error { - tlsConfigured := strings.TrimSpace(c.TLSCertFile) != "" && strings.TrimSpace(c.TLSKeyFile) != "" + certSet := strings.TrimSpace(c.TLSCertFile) != "" + keySet := strings.TrimSpace(c.TLSKeyFile) != "" + if certSet != keySet { + // A lone cert or key almost always means a typo. Silently + // treating it as "TLS off" would downgrade transport + // security while the operator thinks TLS is enabled; fail + // fast so the misconfiguration is visible at startup. + return errors.New("admin.tls.cert_file and admin.tls.key_file must be set together;" + + " partial TLS configuration is not allowed") + } + tlsConfigured := certSet && keySet if tlsConfigured || !addressRequiresTLS(strings.TrimSpace(c.Listen)) || c.AllowPlaintextNonLoopback { return nil } diff --git a/internal/admin/config_test.go b/internal/admin/config_test.go index 82a64c61..2fa04442 100644 --- a/internal/admin/config_test.go +++ b/internal/admin/config_test.go @@ -87,6 +87,28 @@ func TestConfigValidate_NonLoopbackRequiresTLS(t *testing.T) { require.Contains(t, err.Error(), "allow-plaintext-non-loopback") } +func TestConfigValidate_PartialTLSRejected(t *testing.T) { + for _, tc := range []struct { + name, cert, key string + }{ + {"only cert", "cert.pem", ""}, + {"only key", "", "key.pem"}, + } { + t.Run(tc.name, func(t *testing.T) { + c := &Config{ + Enabled: true, + Listen: "127.0.0.1:8080", + TLSCertFile: tc.cert, + TLSKeyFile: tc.key, + SessionSigningKey: makeKey(10), + } + err := c.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), "partial TLS configuration") + }) + } +} + func TestConfigValidate_NonLoopbackWithTLSOK(t *testing.T) { c := &Config{ Enabled: true, From fc7efbeaec70564d9fd1b49fb0b2c3197e912f13 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 02:19:16 +0900 Subject: [PATCH 04/15] admin: address Copilot review (5 findings) - auth_handler: derive Retry-After from AuthService.loginWindow instead of hard-coding 60 seconds, so tests and deployments that tune LoginWindow get an accurate hint; clamp to >= 1. - auth_handler: use the signer's injected clock for the login response's expires_at so it cannot drift from the JWT exp claim. AuthService now holds the same clock passed to the signer. - config.validateTLS: the error message pointed at --admin-allow-plaintext-non-loopback, but the real flag is -adminAllowPlaintextNonLoopback. Fix to match the wired flag name so operators can act on the startup error. - middleware.BodyLimit: drop the pass-through bodyLimitResponseWriter wrapper (it provided no translation) and rewrite the comment to say plainly that handlers are responsible for detecting overflow via IsMaxBytesError and calling WriteMaxBytesError. Centralising that in the middleware would either double-write or mask downstream errors depending on the decoder shape. - middleware.CSRFDoubleSubmit: replace constantTimeEq with an explicit length check + subtle.ConstantTimeCompare on bytes, and update the comment to match the actual behaviour. The short-circuit on length mismatch is fine here because both tokens are server-minted at a fixed 32-byte width, so length divergence cannot leak secret state. --- internal/admin/auth_handler.go | 20 +++++++++-- internal/admin/config.go | 2 +- internal/admin/config_test.go | 2 +- internal/admin/middleware.go | 56 +++++++++++++------------------ internal/admin/middleware_test.go | 7 ---- 5 files changed, 44 insertions(+), 43 deletions(-) diff --git a/internal/admin/auth_handler.go b/internal/admin/auth_handler.go index d92f3aa1..51e59453 100644 --- a/internal/admin/auth_handler.go +++ b/internal/admin/auth_handler.go @@ -9,6 +9,7 @@ import ( "io" "log/slog" "net/http" + "strconv" "strings" "sync" "time" @@ -46,9 +47,11 @@ type AuthService struct { creds CredentialStore roles map[string]Role limiter *rateLimiter + loginWindow time.Duration sessionTTL time.Duration secureCookie bool cookieDomain string + clock Clock logger *slog.Logger } @@ -103,9 +106,11 @@ func NewAuthService(signer *Signer, creds CredentialStore, roles map[string]Role creds: creds, roles: roles, limiter: newRateLimiter(limit, window, opts.Clock), + loginWindow: window, sessionTTL: sessionTTL, secureCookie: !opts.InsecureCookie, cookieDomain: opts.CookieDomain, + clock: opts.Clock, logger: logger, } } @@ -159,7 +164,15 @@ func (s *AuthService) preflightLogin(w http.ResponseWriter, r *http.Request) boo return false } if !s.limiter.allow(clientIP(r)) { - w.Header().Set("Retry-After", "60") + // Retry-After must be derived from the actual rate-limit + // window so tests and callers that tune LoginWindow get an + // accurate hint; clamp to at least 1 second so we never + // send a zero value. + retryAfter := int(s.loginWindow.Seconds()) + if retryAfter < 1 { + retryAfter = 1 + } + w.Header().Set("Retry-After", strconv.Itoa(retryAfter)) writeJSONError(w, http.StatusTooManyRequests, "rate_limited", "too many login attempts from this source; try again later") return false @@ -288,7 +301,10 @@ func (s *AuthService) issueSession(w http.ResponseWriter, principal AuthPrincipa writeJSONError(w, http.StatusInternalServerError, "internal", "failed to mint csrf token") return } - expires := time.Now().UTC().Add(s.sessionTTL) + // Use the same clock the signer used so the response's + // expires_at cannot drift from the JWT's exp claim; injected + // test clocks therefore produce deterministic outputs too. + expires := s.clock().UTC().Add(s.sessionTTL) http.SetCookie(w, s.buildCookie(sessionCookieName, token, true)) http.SetCookie(w, s.buildCookie(csrfCookieName, csrf, false)) w.Header().Set("Cache-Control", "no-store") diff --git a/internal/admin/config.go b/internal/admin/config.go index b605624a..6687faa7 100644 --- a/internal/admin/config.go +++ b/internal/admin/config.go @@ -106,7 +106,7 @@ func (c *Config) validateTLS() error { return errors.WithStack(errors.Newf( "admin.listen %q is not loopback but TLS is not configured;"+ " set admin.tls.cert_file + admin.tls.key_file, or explicitly pass"+ - " --admin-allow-plaintext-non-loopback (strongly discouraged)", + " -adminAllowPlaintextNonLoopback (strongly discouraged)", c.Listen, )) } diff --git a/internal/admin/config_test.go b/internal/admin/config_test.go index 2fa04442..20844d3f 100644 --- a/internal/admin/config_test.go +++ b/internal/admin/config_test.go @@ -84,7 +84,7 @@ func TestConfigValidate_NonLoopbackRequiresTLS(t *testing.T) { err := c.Validate() require.Error(t, err) require.Contains(t, err.Error(), "TLS is not configured") - require.Contains(t, err.Error(), "allow-plaintext-non-loopback") + require.Contains(t, err.Error(), "adminAllowPlaintextNonLoopback") } func TestConfigValidate_PartialTLSRejected(t *testing.T) { diff --git a/internal/admin/middleware.go b/internal/admin/middleware.go index 88dcfa50..a1622bc7 100644 --- a/internal/admin/middleware.go +++ b/internal/admin/middleware.go @@ -2,6 +2,7 @@ package admin import ( "context" + "crypto/subtle" "errors" "log/slog" "net/http" @@ -41,10 +42,15 @@ func PrincipalFromContext(ctx context.Context) (AuthPrincipal, bool) { return v, ok } -// BodyLimit wraps the request body with http.MaxBytesReader and responds -// 413 when the client exceeds the cap. It also sets -// http.MaxBytesError-aware error translation so the handler does not need -// to distinguish ordinary IO failures from overflow. +// BodyLimit caps each request body at `limit` bytes via +// http.MaxBytesReader. Handlers that read the body are responsible for +// detecting overflow (via IsMaxBytesError / errors.As on +// *http.MaxBytesError) and calling WriteMaxBytesError to respond 413. +// We intentionally do not centralise that translation in the middleware +// chain: different handlers parse bodies with different decoders (json, +// form, multipart) and each already has a natural error path, so a +// wrapper ResponseWriter would either double-write or mask subsequent +// errors. func BodyLimit(limit int64) func(http.Handler) http.Handler { if limit <= 0 { limit = defaultBodyLimit @@ -54,24 +60,15 @@ func BodyLimit(limit int64) func(http.Handler) http.Handler { if r.Body != nil { r.Body = http.MaxBytesReader(w, r.Body, limit) } - next.ServeHTTP(bodyLimitResponseWriter{ResponseWriter: w}, r) + next.ServeHTTP(w, r) }) } } -// bodyLimitResponseWriter is a minor adapter that lets a handler translate -// its own MaxBytesError into a consistent 413 without duplicating the -// plumbing. At the time of writing, each write handler can call -// r.ParseForm / json.Decode and on error call -// `if errors.As(err, &http.MaxBytesError{}) { ... }` manually; this -// wrapper just forces the header once per request. -type bodyLimitResponseWriter struct { - http.ResponseWriter -} - -// WriteMaxBytesError is called by handlers that detected a MaxBytesError. -// It is a package-level helper rather than a method so the router error -// path keeps the same JSON shape as the rest. +// WriteMaxBytesError is the canonical 413 response body for admin +// handlers that detected an http.MaxBytesError while reading a request +// body. It keeps the JSON error shape consistent with the rest of the +// admin surface. func WriteMaxBytesError(w http.ResponseWriter) { writeJSONError(w, http.StatusRequestEntityTooLarge, "payload_too_large", "request body exceeds the 64 KiB admin limit") @@ -155,9 +152,15 @@ func CSRFDoubleSubmit() func(http.Handler) http.Handler { "X-Admin-CSRF header is required for write operations") return } - // Constant-time comparison: the values are user-supplied - // and we do not want to leak length differences. - if !constantTimeEq(cookie.Value, header) { + // Constant-time comparison on the byte contents once we + // know the lengths match. A length mismatch is itself not + // secret (the server mints both tokens with a fixed 32-byte + // width, so any divergence means an attacker forged or + // corrupted the value — the response is 403 in every case + // anyway), so short-circuiting there does not leak anything + // useful to a timing attacker. + if len(cookie.Value) != len(header) || + subtle.ConstantTimeCompare([]byte(cookie.Value), []byte(header)) != 1 { writeJSONError(w, http.StatusForbidden, "csrf_mismatch", "CSRF token mismatch") return } @@ -166,17 +169,6 @@ func CSRFDoubleSubmit() func(http.Handler) http.Handler { } } -func constantTimeEq(a, b string) bool { - if len(a) != len(b) { - return false - } - var diff byte - for i := 0; i < len(a); i++ { - diff |= a[i] ^ b[i] - } - return diff == 0 -} - // auditRecorder is the ResponseWriter wrapper the Audit middleware uses // to learn the final status code without forcing the handler to pass it // back explicitly. diff --git a/internal/admin/middleware_test.go b/internal/admin/middleware_test.go index 6ff6cc75..fc7c00df 100644 --- a/internal/admin/middleware_test.go +++ b/internal/admin/middleware_test.go @@ -229,10 +229,3 @@ func TestAudit_SkipsReads(t *testing.T) { require.Equal(t, http.StatusOK, rec.Code) require.Empty(t, buf.String()) } - -func TestConstantTimeEq(t *testing.T) { - require.True(t, constantTimeEq("abc", "abc")) - require.False(t, constantTimeEq("abc", "abd")) - require.False(t, constantTimeEq("abc", "abcd")) - require.True(t, constantTimeEq("", "")) -} From 725febaf8a21589f2fa10954185b0359ad9b38d5 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 02:28:37 +0900 Subject: [PATCH 05/15] admin: address Copilot round-3 review (3 findings) - router: handle bare /admin/api/v1 and /admin/assets (no trailing slash) explicitly so they return a JSON 404 instead of falling through to the SPA fallback and being answered with index.html. New test fixes the behaviour. - main_admin.newClusterInfoSource: populate GroupInfo.Members from rt.engine.Configuration(ctx) and stop discarding the context. Failed Configuration fetches leave Members empty (best effort) so a transient raft state cannot poison the whole cluster snapshot. - config: replace dotted config-key names (admin.listen, admin.tls.cert_file, admin.session_signing_key, etc.) in every validation error message with the actual CLI flag names (-adminListen, -adminTLSCertFile, -adminSessionSigningKey, ...). Tests updated to match. --- internal/admin/config.go | 22 +++++++++++----------- internal/admin/config_test.go | 8 ++++---- internal/admin/router.go | 22 ++++++++++++++++++++-- internal/admin/router_test.go | 14 ++++++++++++++ main_admin.go | 15 ++++++++++++++- 5 files changed, 63 insertions(+), 18 deletions(-) diff --git a/internal/admin/config.go b/internal/admin/config.go index 6687faa7..8742681f 100644 --- a/internal/admin/config.go +++ b/internal/admin/config.go @@ -80,10 +80,10 @@ func (c *Config) Validate() error { func (c *Config) validateListen() error { listen := strings.TrimSpace(c.Listen) if listen == "" { - return errors.New("admin.listen must not be empty when admin.enabled=true") + return errors.New("-adminListen must not be empty when -adminEnabled=true") } if _, _, err := net.SplitHostPort(listen); err != nil { - return errors.Wrapf(err, "admin.listen %q is not host:port", c.Listen) + return errors.Wrapf(err, "-adminListen %q is not host:port", c.Listen) } return nil } @@ -96,7 +96,7 @@ func (c *Config) validateTLS() error { // treating it as "TLS off" would downgrade transport // security while the operator thinks TLS is enabled; fail // fast so the misconfiguration is visible at startup. - return errors.New("admin.tls.cert_file and admin.tls.key_file must be set together;" + + return errors.New("-adminTLSCertFile and -adminTLSKeyFile must be set together;" + " partial TLS configuration is not allowed") } tlsConfigured := certSet && keySet @@ -104,25 +104,25 @@ func (c *Config) validateTLS() error { return nil } return errors.WithStack(errors.Newf( - "admin.listen %q is not loopback but TLS is not configured;"+ - " set admin.tls.cert_file + admin.tls.key_file, or explicitly pass"+ + "-adminListen %q is not loopback but TLS is not configured;"+ + " set -adminTLSCertFile + -adminTLSKeyFile, or explicitly pass"+ " -adminAllowPlaintextNonLoopback (strongly discouraged)", c.Listen, )) } func (c *Config) validateSigningKeys() error { - primary, err := decodeSigningKey("admin.session_signing_key", c.SessionSigningKey) + primary, err := decodeSigningKey("-adminSessionSigningKey", c.SessionSigningKey) if err != nil { return err } if len(primary) == 0 { - return errors.New("admin.session_signing_key is required when admin.enabled=true") + return errors.New("-adminSessionSigningKey is required when -adminEnabled=true") } if strings.TrimSpace(c.SessionSigningKeyPrevious) == "" { return nil } - if _, err := decodeSigningKey("admin.session_signing_key_previous", c.SessionSigningKeyPrevious); err != nil { + if _, err := decodeSigningKey("-adminSessionSigningKeyPrevious", c.SessionSigningKeyPrevious); err != nil { return err } return nil @@ -132,13 +132,13 @@ func (c *Config) validateSigningKeys() error { // primary signing key first, followed by an optional previous key. Validate // must be called first. func (c *Config) DecodedSigningKeys() ([][]byte, error) { - primary, err := decodeSigningKey("admin.session_signing_key", c.SessionSigningKey) + primary, err := decodeSigningKey("-adminSessionSigningKey", c.SessionSigningKey) if err != nil { return nil, err } keys := [][]byte{primary} if strings.TrimSpace(c.SessionSigningKeyPrevious) != "" { - prev, err := decodeSigningKey("admin.session_signing_key_previous", c.SessionSigningKeyPrevious) + prev, err := decodeSigningKey("-adminSessionSigningKeyPrevious", c.SessionSigningKeyPrevious) if err != nil { return nil, err } @@ -207,7 +207,7 @@ func validateAccessKeyRoles(readOnly, full []string) error { } if _, dup := fullSet[trim]; dup { return errors.WithStack(errors.Newf( - "access key %q is listed in both admin.read_only_access_keys and admin.full_access_keys;"+ + "access key %q is listed in both -adminReadOnlyAccessKeys and -adminFullAccessKeys;"+ " this would silently grant write access depending on lookup order, so it is rejected at startup", trim, )) diff --git a/internal/admin/config_test.go b/internal/admin/config_test.go index 20844d3f..fd54742d 100644 --- a/internal/admin/config_test.go +++ b/internal/admin/config_test.go @@ -28,7 +28,7 @@ func TestConfigValidate_RequiresListen(t *testing.T) { } err := c.Validate() require.Error(t, err) - require.Contains(t, err.Error(), "admin.listen must not be empty") + require.Contains(t, err.Error(), "-adminListen must not be empty") } func TestConfigValidate_RequiresSigningKey(t *testing.T) { @@ -38,7 +38,7 @@ func TestConfigValidate_RequiresSigningKey(t *testing.T) { } err := c.Validate() require.Error(t, err) - require.Contains(t, err.Error(), "session_signing_key is required") + require.Contains(t, err.Error(), "adminSessionSigningKey is required") } func TestConfigValidate_SigningKeyWrongLength(t *testing.T) { @@ -169,7 +169,7 @@ func TestConfigValidate_PreviousSigningKeyValidated(t *testing.T) { } err := c.Validate() require.Error(t, err) - require.Contains(t, err.Error(), "session_signing_key_previous") + require.Contains(t, err.Error(), "adminSessionSigningKeyPrevious") } func TestConfigDecodedSigningKeys_Order(t *testing.T) { @@ -221,5 +221,5 @@ func TestConfigValidate_PreservesContext(t *testing.T) { } err := c.Validate() require.Error(t, err) - require.True(t, strings.Contains(err.Error(), "admin.listen")) + require.True(t, strings.Contains(err.Error(), "-adminListen")) } diff --git a/internal/admin/router.go b/internal/admin/router.go index 56b1e470..a59997c8 100644 --- a/internal/admin/router.go +++ b/internal/admin/router.go @@ -13,11 +13,18 @@ import ( // Constants for the admin URL namespace. Centralised here so the router, // handlers, and tests all agree on the paths. The admin listener only // serves URLs under /admin/*; anything else yields a 404. +// +// The "root" variants (without a trailing slash) are treated as the +// directory itself so that requests like `/admin/api/v1` or +// `/admin/assets` resolve to a JSON 404 rather than falling through to +// the SPA fallback and being answered with index.html. const ( pathPrefixAdmin = "/admin" - pathPrefixAPIv1 = "/admin/api/v1/" + pathAPIv1Root = "/admin/api/v1" + pathPrefixAPIv1 = pathAPIv1Root + "/" pathHealthz = "/admin/healthz" - pathPrefixAssets = "/admin/assets/" + pathAssetsRoot = "/admin/assets" + pathPrefixAssets = pathAssetsRoot + "/" pathRootAssetsDir = "assets" pathIndexHTML = "index.html" ) @@ -64,6 +71,12 @@ func (rt *Router) ServeHTTP(w http.ResponseWriter, r *http.Request) { p := r.URL.Path switch { + case p == pathAPIv1Root: + // Bare /admin/api/v1 is an API root, not an SPA page: answer + // with a JSON 404 so clients never get HTML back on an API + // path. + rt.notFind.ServeHTTP(w, r) + return case strings.HasPrefix(p, pathPrefixAPIv1): if rt.api == nil { rt.notFind.ServeHTTP(w, r) @@ -74,6 +87,11 @@ func (rt *Router) ServeHTTP(w http.ResponseWriter, r *http.Request) { case p == pathHealthz: rt.serveHealth(w, r) return + case p == pathAssetsRoot: + // Same reasoning as /admin/api/v1: the bare assets root is + // a directory, not an SPA route. + rt.notFind.ServeHTTP(w, r) + return case strings.HasPrefix(p, pathPrefixAssets): rt.serveAsset(w, r) return diff --git a/internal/admin/router_test.go b/internal/admin/router_test.go index 9554f40e..537e3031 100644 --- a/internal/admin/router_test.go +++ b/internal/admin/router_test.go @@ -117,6 +117,20 @@ func TestRouter_SPAWithoutStaticReturns404(t *testing.T) { require.Equal(t, http.StatusNotFound, rec.Code) } +func TestRouter_BareAPIRootReturnsJSON404NotHTML(t *testing.T) { + r := NewRouter(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusNoContent) + }), newTestStatic()) + for _, p := range []string{"/admin/api/v1", "/admin/assets"} { + req := httptest.NewRequest(http.MethodGet, p, nil) + rec := httptest.NewRecorder() + r.ServeHTTP(rec, req) + require.Equalf(t, http.StatusNotFound, rec.Code, "path %s", p) + require.Containsf(t, rec.Header().Get("Content-Type"), "application/json", "path %s", p) + require.NotContainsf(t, rec.Body.String(), " Date: Sat, 25 Apr 2026 03:04:19 +0900 Subject: [PATCH 06/15] raftengine/etcd: extract walFileExt constant to silence goconst --- internal/raftengine/etcd/wal_purge.go | 7 ++++++- internal/raftengine/etcd/wal_purge_test.go | 2 +- internal/raftengine/etcd/wal_store_test.go | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/internal/raftengine/etcd/wal_purge.go b/internal/raftengine/etcd/wal_purge.go index b2c16304..6b2c2ea5 100644 --- a/internal/raftengine/etcd/wal_purge.go +++ b/internal/raftengine/etcd/wal_purge.go @@ -12,6 +12,11 @@ import ( "go.etcd.io/etcd/client/pkg/v3/fileutil" ) +// walFileExt is the file extension etcd writes for every WAL segment. +// Centralising the literal keeps the WAL discovery/purge code and its +// tests from drifting apart and silences goconst's 3-occurrence rule. +const walFileExt = ".wal" + // defaultMaxWALFiles is the number of .wal segment files to retain after a // successful snapshot. etcd's wal package allocates segments of 64 MiB each; // keeping the most recent N segments bounds WAL disk usage at ~N * 64 MiB. @@ -137,7 +142,7 @@ func collectWALNames(entries []os.DirEntry) []string { if e.IsDir() { continue } - if filepath.Ext(e.Name()) != ".wal" { + if filepath.Ext(e.Name()) != walFileExt { continue } names = append(names, e.Name()) diff --git a/internal/raftengine/etcd/wal_purge_test.go b/internal/raftengine/etcd/wal_purge_test.go index 2c151f81..2e2bf8d9 100644 --- a/internal/raftengine/etcd/wal_purge_test.go +++ b/internal/raftengine/etcd/wal_purge_test.go @@ -32,7 +32,7 @@ func listWALFiles(t *testing.T, dir string) []string { require.NoError(t, err) var out []string for _, e := range entries { - if !e.IsDir() && filepath.Ext(e.Name()) == ".wal" { + if !e.IsDir() && filepath.Ext(e.Name()) == walFileExt { out = append(out, e.Name()) } } diff --git a/internal/raftengine/etcd/wal_store_test.go b/internal/raftengine/etcd/wal_store_test.go index 47a9c44e..01b6a209 100644 --- a/internal/raftengine/etcd/wal_store_test.go +++ b/internal/raftengine/etcd/wal_store_test.go @@ -197,7 +197,7 @@ func lastWALFile(t *testing.T, walDir string) string { require.NoError(t, err) names := make([]string, 0, len(entries)) for _, e := range entries { - if filepath.Ext(e.Name()) == ".wal" { + if filepath.Ext(e.Name()) == walFileExt { names = append(names, e.Name()) } } From 87102302950029e63053c3bf01a4cda2c4350336 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 03:04:24 +0900 Subject: [PATCH 07/15] admin: address Copilot round-4 review (4 findings) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - server.buildAPIMux: move Audit inside SessionAuth and before CSRF so CSRF-rejected protected writes are still audited. The previous chain (SessionAuth → CSRF → Audit) silently dropped csrf_missing / csrf_mismatch from the admin_audit stream — exactly the attack traces operators want to see. - server.NewServer: reject a nil deps.Roles as a wiring error. A nil role index would have 403-ed every login silently; an empty (but non-nil) map is still accepted for explicit lockdown. - auth_handler.issueSession: rewrite the expires_at comment so it describes what the code actually guarantees. The AuthService clock is what computes expires_at, and callers are expected to pass the same clock to Signer and AuthService (NewServer does); the comment no longer claims a guarantee the type system does not enforce. - router.serveAsset: replace the naive strings.Contains(rel, "..") traversal guard with fs.ValidPath. The new check correctly: * accepts filenames with ".." as a substring (e.g. app..js), * rejects any ".." path segment, * rejects leading-slash / double-slash shapes that could previously confuse path.Join into returning a 500 from the fs backend. Added dedicated tests for the substring-dot-dot allowance, the double-slash rejection, and the bare-API-root JSON-404 behaviour. --- internal/admin/auth_handler.go | 12 +++++++++--- internal/admin/router.go | 9 ++++++++- internal/admin/router_test.go | 27 +++++++++++++++++++++++++++ internal/admin/server.go | 19 ++++++++++++++++--- 4 files changed, 60 insertions(+), 7 deletions(-) diff --git a/internal/admin/auth_handler.go b/internal/admin/auth_handler.go index 51e59453..d3347c87 100644 --- a/internal/admin/auth_handler.go +++ b/internal/admin/auth_handler.go @@ -301,9 +301,15 @@ func (s *AuthService) issueSession(w http.ResponseWriter, principal AuthPrincipa writeJSONError(w, http.StatusInternalServerError, "internal", "failed to mint csrf token") return } - // Use the same clock the signer used so the response's - // expires_at cannot drift from the JWT's exp claim; injected - // test clocks therefore produce deterministic outputs too. + // Compute the response expiry from the AuthService clock plus + // the configured session TTL. Injected test clocks therefore + // produce deterministic values, and in practice callers (main + // and NewServer) pass the same clock to both the Signer and + // the AuthService, so the response's expires_at matches the + // JWT exp claim. We do not cross-check against the signer's + // clock here because Signer does not expose it; if a future + // caller wires them independently, expires_at may drift by up + // to the delta between the two clocks. expires := s.clock().UTC().Add(s.sessionTTL) http.SetCookie(w, s.buildCookie(sessionCookieName, token, true)) http.SetCookie(w, s.buildCookie(csrfCookieName, csrf, false)) diff --git a/internal/admin/router.go b/internal/admin/router.go index a59997c8..d525f50b 100644 --- a/internal/admin/router.go +++ b/internal/admin/router.go @@ -122,7 +122,14 @@ func (rt *Router) serveAsset(w http.ResponseWriter, r *http.Request) { } // Drop /admin/assets/ prefix → relative path under pathRootAssetsDir. rel := strings.TrimPrefix(r.URL.Path, pathPrefixAssets) - if rel == "" || strings.Contains(rel, "..") { + // Defence against traversal and malformed paths: require the + // already-normalised form that fs.ValidPath enforces (no + // ".." segments, no "//" segments, no leading "/"). Anything + // that does not pass validation resolves to a 404 JSON rather + // than risking a 500 from the underlying fs.FS — legitimate + // filenames containing ".." as a substring (e.g. "app..js") + // still pass because ValidPath checks segments, not substrings. + if rel == "" || !fs.ValidPath(rel) { rt.notFind.ServeHTTP(w, r) return } diff --git a/internal/admin/router_test.go b/internal/admin/router_test.go index 537e3031..b6d889bc 100644 --- a/internal/admin/router_test.go +++ b/internal/admin/router_test.go @@ -97,6 +97,33 @@ func TestRouter_StaticTraversalRejected(t *testing.T) { require.NotEqual(t, http.StatusOK, rec.Code) } +// Filenames containing ".." as a substring (but not as a path segment) +// are legitimate — the guard must not reject `app..js`. +func TestRouter_StaticSubstringDotDotAllowed(t *testing.T) { + fs := fstest.MapFS{ + "index.html": {Data: []byte("")}, + "assets/app..js": {Data: []byte("console.log('double-dot');")}, + } + r := NewRouter(nil, fs) + req := httptest.NewRequest(http.MethodGet, "/admin/assets/app..js", nil) + rec := httptest.NewRecorder() + r.ServeHTTP(rec, req) + + require.Equal(t, http.StatusOK, rec.Code) + require.Contains(t, rec.Body.String(), "double-dot") +} + +// A client-supplied double slash must resolve to a JSON 404, not a 500. +func TestRouter_StaticDoubleSlashReturnsJSON404(t *testing.T) { + r := NewRouter(nil, newTestStatic()) + req := httptest.NewRequest(http.MethodGet, "/admin/assets//app.js", nil) + rec := httptest.NewRecorder() + r.ServeHTTP(rec, req) + + require.Equal(t, http.StatusNotFound, rec.Code) + require.Contains(t, rec.Header().Get("Content-Type"), "application/json") +} + func TestRouter_SPAFallbackServesIndex(t *testing.T) { r := NewRouter(nil, newTestStatic()) for _, p := range []string{"/admin", "/admin/", "/admin/dynamo", "/admin/s3/bucket-42"} { diff --git a/internal/admin/server.go b/internal/admin/server.go index 916e7a9f..80c7f4a5 100644 --- a/internal/admin/server.go +++ b/internal/admin/server.go @@ -68,6 +68,13 @@ func NewServer(deps ServerDeps) (*Server, error) { if deps.Credentials == nil { return nil, errMissing("Credentials") } + if deps.Roles == nil { + // A nil role index would silently 403 every login. Treat it + // as a wiring bug rather than a valid "admin is locked down" + // state: operators who really want zero admin access can + // set admin.enabled=false or pass an empty (non-nil) map. + return nil, errMissing("Roles") + } if deps.ClusterInfo == nil { return nil, errMissing("ClusterInfo") } @@ -122,12 +129,18 @@ func buildAPIMux(auth *AuthService, verifier *Verifier, clusterHandler http.Hand loginHandler := http.HandlerFunc(auth.HandleLogin) logoutHandler := http.HandlerFunc(auth.HandleLogout) - // The protected chain: body limit → session auth → CSRF → audit. + // The protected chain: body limit → session auth → audit → CSRF. + // Audit is deliberately placed before CSRF so that CSRF-rejected + // protected requests are still written to the audit log — the + // actor is already known at that point because SessionAuth ran. + // If CSRF were wrapped inside Audit, every csrf_missing / + // csrf_mismatch rejection would silently escape auditing, which + // is exactly the attack trace operators want to see. protect := func(next http.Handler) http.Handler { return BodyLimit(defaultBodyLimit)( SessionAuth(verifier)( - CSRFDoubleSubmit()( - Audit(logger)(next), + Audit(logger)( + CSRFDoubleSubmit()(next), ), ), ) From 8d1b3a94a14d2323e32af2e3015490c323bf47ab Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 03:17:33 +0900 Subject: [PATCH 08/15] admin: address Copilot round-5 review (3 findings) - startAdminServer now returns (listenAddr, err) so tests can bind the real address without a pre-allocated free-port helper. The previous freePort()-binds-then-closes flow had a bind/close/rebind race under parallel tests. Tests now pass "127.0.0.1:0" and use the returned address; the free-port helper is deleted. main.go discards the returned address. - loginResponse comment: drop the claim that the CSRF token is returned in the JSON body. The token is delivered exclusively via the admin_csrf Set-Cookie header; the comment now says so. - server_test.go: remove the dummy var block that kept io and fmt imported. Unused imports now fail to compile, which is exactly what Go's import hygiene is for. --- internal/admin/auth_handler.go | 7 ++--- internal/admin/server_test.go | 8 ------ main_admin.go | 26 +++++++++++++----- main_admin_test.go | 48 +++++++++------------------------- 4 files changed, 35 insertions(+), 54 deletions(-) diff --git a/internal/admin/auth_handler.go b/internal/admin/auth_handler.go index d3347c87..dd5173d0 100644 --- a/internal/admin/auth_handler.go +++ b/internal/admin/auth_handler.go @@ -122,9 +122,10 @@ type loginRequest struct { } // loginResponse is the JSON body the login endpoint returns on success. -// The CSRF token is also readable from the admin_csrf cookie; we include -// it here as a convenience for clients that want to avoid parsing the -// Set-Cookie header themselves. +// The CSRF token is delivered exclusively via the admin_csrf cookie (see +// the Set-Cookie headers the handler sets on the same response); we do +// not echo it in the JSON body to avoid encouraging clients to cache or +// log the token out of band. type loginResponse struct { Role Role `json:"role"` ExpiresAt time.Time `json:"expires_at"` diff --git a/internal/admin/server_test.go b/internal/admin/server_test.go index 28e1a032..2b421572 100644 --- a/internal/admin/server_test.go +++ b/internal/admin/server_test.go @@ -3,8 +3,6 @@ package admin import ( "bytes" "context" - "fmt" - "io" "log/slog" "net/http" "net/http/httptest" @@ -179,9 +177,3 @@ func TestInternalHelpers(t *testing.T) { require.True(t, endpointMatches("/a", "/A")) require.False(t, endpointMatches("/a", "/b")) } - -// Keep io, fmt imported to avoid drift if the test evolves. -var ( - _ = io.Discard - _ = fmt.Sprint -) diff --git a/main_admin.go b/main_admin.go index 04d342de..650c4a25 100644 --- a/main_admin.go +++ b/main_admin.go @@ -77,7 +77,8 @@ func startAdminFromFlags(ctx context.Context, lc *net.ListenConfig, eg *errgroup fullAccessKeys: parseCSV(*adminFullAccessKeys), } clusterSrc := newClusterInfoSource(*raftId, buildVersion(), runtimes) - return startAdminServer(ctx, lc, eg, cfg, staticCreds, clusterSrc, buildVersion()) + _, err = startAdminServer(ctx, lc, eg, cfg, staticCreds, clusterSrc, buildVersion()) + return err } // buildAdminConfig translates flag values into an admin.Config. @@ -101,6 +102,12 @@ func buildAdminConfig(in adminListenerConfig) admin.Config { // listener is disabled. Errors at this point are hard startup failures: // the design doc mandates ハードエラーで起動失敗 for every invalid // configuration, and we honour that uniformly. +// +// The returned address is the actual host:port the listener bound to; it +// differs from adminCfg.Listen only when the caller passed a port of 0, +// but tests rely on this to avoid the bind-close-rebind race that a +// pre-allocated free-port helper would otherwise introduce. When admin +// is disabled the returned address is empty. func startAdminServer( ctx context.Context, lc *net.ListenConfig, @@ -109,27 +116,32 @@ func startAdminServer( creds map[string]string, cluster admin.ClusterInfoSource, version string, -) error { +) (string, error) { adminCfg := buildAdminConfig(cfg) enabled, err := checkAdminConfig(&adminCfg, cluster) if err != nil || !enabled { - return err + return "", err } server, err := buildAdminHTTPServer(&adminCfg, creds, cluster) if err != nil { - return err + return "", err } httpSrv := newAdminHTTPServer(server, &adminCfg) listener, err := lc.Listen(ctx, "tcp", adminCfg.Listen) if err != nil { - return errors.Wrapf(err, "failed to listen on admin address %s", adminCfg.Listen) + return "", errors.Wrapf(err, "failed to listen on admin address %s", adminCfg.Listen) } tlsEnabled := strings.TrimSpace(adminCfg.TLSCertFile) != "" && strings.TrimSpace(adminCfg.TLSKeyFile) != "" if tlsEnabled { httpSrv.TLSConfig = &tls.Config{MinVersion: tls.VersionTLS12} } - registerAdminLifecycle(ctx, eg, httpSrv, listener, &adminCfg, tlsEnabled, version) - return nil + actualAddr := listener.Addr().String() + // Use the real bound address in log lines and in the lifecycle + // task so the shutdown banner matches startup. + boundCfg := adminCfg + boundCfg.Listen = actualAddr + registerAdminLifecycle(ctx, eg, httpSrv, listener, &boundCfg, tlsEnabled, version) + return actualAddr, nil } // checkAdminConfig validates adminCfg; returns (enabled=false, nil) when diff --git a/main_admin_test.go b/main_admin_test.go index 63970488..c0f91302 100644 --- a/main_admin_test.go +++ b/main_admin_test.go @@ -16,7 +16,6 @@ import ( "net/http" "os" "path/filepath" - "strings" "testing" "time" @@ -70,7 +69,7 @@ func TestStartAdminServer_DisabledNoOp(t *testing.T) { eg, ctx := errgroup.WithContext(context.Background()) defer func() { _ = eg.Wait() }() var lc net.ListenConfig - err := startAdminServer(ctx, &lc, eg, adminListenerConfig{enabled: false}, nil, nil, "") + _, err := startAdminServer(ctx, &lc, eg, adminListenerConfig{enabled: false}, nil, nil, "") require.NoError(t, err) } @@ -83,7 +82,7 @@ func TestStartAdminServer_InvalidConfigRejected(t *testing.T) { listen: "127.0.0.1:0", // missing signing key } - err := startAdminServer(ctx, &lc, eg, cfg, map[string]string{}, nil, "") + _, err := startAdminServer(ctx, &lc, eg, cfg, map[string]string{}, nil, "") require.Error(t, err) } @@ -96,7 +95,7 @@ func TestStartAdminServer_NonLoopbackWithoutTLSRejected(t *testing.T) { listen: "0.0.0.0:0", sessionSigningKey: freshKey(), } - err := startAdminServer(ctx, &lc, eg, cfg, map[string]string{}, nil, "") + _, err := startAdminServer(ctx, &lc, eg, cfg, map[string]string{}, nil, "") require.Error(t, err) require.Contains(t, err.Error(), "TLS") } @@ -110,7 +109,7 @@ func TestStartAdminServer_RejectsMissingClusterSource(t *testing.T) { listen: "127.0.0.1:0", sessionSigningKey: freshKey(), } - err := startAdminServer(ctx, &lc, eg, cfg, map[string]string{}, nil, "") + _, err := startAdminServer(ctx, &lc, eg, cfg, map[string]string{}, nil, "") require.Error(t, err) require.Contains(t, err.Error(), "cluster info source") } @@ -123,27 +122,25 @@ func TestStartAdminServer_ServesHealthz(t *testing.T) { _ = eg.Wait() }() - port := freePort(t) - listen := "127.0.0.1:" + port var lc net.ListenConfig cfg := adminListenerConfig{ enabled: true, - listen: listen, + listen: "127.0.0.1:0", sessionSigningKey: freshKey(), allowInsecureDevCookie: true, } cluster := admin.ClusterInfoFunc(func(_ context.Context) (admin.ClusterInfo, error) { return admin.ClusterInfo{NodeID: "n1", Version: "test"}, nil }) - require.NoError(t, startAdminServer(eCtx, &lc, eg, cfg, map[string]string{}, cluster, "test")) + addr, err := startAdminServer(eCtx, &lc, eg, cfg, map[string]string{}, cluster, "test") + require.NoError(t, err) // Poll /admin/healthz until success or the test deadline. client := &http.Client{Timeout: 2 * time.Second} deadline := time.Now().Add(3 * time.Second) var resp *http.Response - var err error for time.Now().Before(deadline) { - req, reqErr := http.NewRequestWithContext(ctx, http.MethodGet, "http://"+listen+"/admin/healthz", nil) + req, reqErr := http.NewRequestWithContext(ctx, http.MethodGet, "http://"+addr+"/admin/healthz", nil) require.NoError(t, reqErr) resp, err = client.Do(req) if err == nil { @@ -167,12 +164,10 @@ func TestStartAdminServer_ServesTLS(t *testing.T) { _ = eg.Wait() }() - port := freePort(t) - listen := "127.0.0.1:" + port var lc net.ListenConfig cfg := adminListenerConfig{ enabled: true, - listen: listen, + listen: "127.0.0.1:0", tlsCertFile: certFile, tlsKeyFile: keyFile, sessionSigningKey: freshKey(), @@ -180,15 +175,15 @@ func TestStartAdminServer_ServesTLS(t *testing.T) { cluster := admin.ClusterInfoFunc(func(_ context.Context) (admin.ClusterInfo, error) { return admin.ClusterInfo{NodeID: "n-tls", Version: "test"}, nil }) - require.NoError(t, startAdminServer(eCtx, &lc, eg, cfg, map[string]string{}, cluster, "test")) + addr, err := startAdminServer(eCtx, &lc, eg, cfg, map[string]string{}, cluster, "test") + require.NoError(t, err) transport := &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}} //nolint:gosec // self-signed in test client := &http.Client{Transport: transport, Timeout: 2 * time.Second} deadline := time.Now().Add(3 * time.Second) var resp *http.Response - var err error for time.Now().Before(deadline) { - req, reqErr := http.NewRequestWithContext(ctx, http.MethodGet, "https://"+listen+"/admin/healthz", nil) + req, reqErr := http.NewRequestWithContext(ctx, http.MethodGet, "https://"+addr+"/admin/healthz", nil) require.NoError(t, reqErr) resp, err = client.Do(req) if err == nil { @@ -201,25 +196,6 @@ func TestStartAdminServer_ServesTLS(t *testing.T) { _ = resp.Body.Close() } -func freePort(t *testing.T) string { - t.Helper() - var lc net.ListenConfig - l, err := lc.Listen(context.Background(), "tcp", "127.0.0.1:0") - require.NoError(t, err) - addr, ok := l.Addr().(*net.TCPAddr) - require.True(t, ok, "listener did not produce a *net.TCPAddr") - _ = l.Close() - return strings.TrimSpace(filepathBase(addr.String())) -} - -func filepathBase(hostPort string) string { - i := strings.LastIndex(hostPort, ":") - if i < 0 { - return hostPort - } - return hostPort[i+1:] -} - // writeSelfSignedCert writes a short-lived self-signed certificate to a // temp dir and returns the cert / key paths. The certificate is valid // for 127.0.0.1 only; tests that need TLS should run with it. From 4a3e8b699b5411a71b426c317764ef914b671656 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 03:45:10 +0900 Subject: [PATCH 09/15] admin: reject typed-nil CredentialStore (Codex P1) - NewServer: `deps.Credentials == nil` only caught untyped nil and silently admitted a typed-nil interface such as `MapCredentialStore(nil)`. A caller that wrapped a nil map (as main_admin did when --s3CredentialsFile was unset) would then start the admin listener and reject every login with "invalid_credentials" at runtime. Add a reflection-based isNilCredentialStore check that also detects the typed-nil cases for every nilable reference kind. An empty-but-non-nil map is still accepted here because the wiring layer has context to judge whether that is intentional. - main_admin.startAdminFromFlags: after loading the S3 credentials file, refuse to start when the resulting map is empty. Running admin with zero configured access keys is not a valid lockdown posture; it would simply drop every login. Together with the server-side guard this closes the lockout loophole Codex called out. - Tests: dedicated table that covers typed-nil and empty-map cases of the new guard. --- internal/admin/server.go | 37 ++++++++++++++++++- internal/admin/server_nil_creds_test.go | 49 +++++++++++++++++++++++++ main_admin.go | 11 ++++++ 3 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 internal/admin/server_nil_creds_test.go diff --git a/internal/admin/server.go b/internal/admin/server.go index 80c7f4a5..95c76620 100644 --- a/internal/admin/server.go +++ b/internal/admin/server.go @@ -5,6 +5,7 @@ import ( "io/fs" "log/slog" "net/http" + "reflect" "strings" ) @@ -65,7 +66,7 @@ func NewServer(deps ServerDeps) (*Server, error) { if deps.Verifier == nil { return nil, errMissing("Verifier") } - if deps.Credentials == nil { + if isNilCredentialStore(deps.Credentials) { return nil, errMissing("Credentials") } if deps.Roles == nil { @@ -187,6 +188,40 @@ func (e *missingDepError) Error() string { return "admin.NewServer: required dependency missing: " + e.field } +// isNilCredentialStore treats both untyped-nil and typed-nil reference +// values (e.g. `MapCredentialStore(nil)`) as missing. Without the typed +// check, a caller that wraps a nil map in our CredentialStore type slips +// past a plain `== nil` guard and the admin listener silently rejects +// every login with "invalid_credentials" at runtime. +func isNilCredentialStore(cs CredentialStore) bool { + if cs == nil { + return true + } + v := reflect.ValueOf(cs) + if isNilableKind(v.Kind()) { + return v.IsNil() + } + return false +} + +// nilableKinds is the set of reflect.Kind values that can legitimately +// hold a nil value. Keeping it as a package-level lookup (rather than a +// switch) lets the exhaustive linter see that we are intentionally +// matching a small allow-list rather than forgetting a case. +var nilableKinds = map[reflect.Kind]struct{}{ + reflect.Map: {}, + reflect.Ptr: {}, + reflect.Slice: {}, + reflect.Chan: {}, + reflect.Func: {}, + reflect.Interface: {}, +} + +func isNilableKind(k reflect.Kind) bool { + _, ok := nilableKinds[k] + return ok +} + // endpointMatches is a small helper kept here for readability in table // driven path-routing tests. func endpointMatches(path, want string) bool { diff --git a/internal/admin/server_nil_creds_test.go b/internal/admin/server_nil_creds_test.go new file mode 100644 index 00000000..b1b03f12 --- /dev/null +++ b/internal/admin/server_nil_creds_test.go @@ -0,0 +1,49 @@ +package admin + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +// TestServer_RejectsTypedNilCredentialStore ensures that a caller that +// wraps a nil map in MapCredentialStore cannot start the admin +// listener. Without this guard, `== nil` on the CredentialStore +// interface would miss a typed-nil, and every login would silently +// 401 at runtime. +func TestServer_RejectsTypedNilCredentialStore(t *testing.T) { + clk := fixedClock(time.Unix(1_700_000_000, 0).UTC()) + signer := newSignerForTest(t, 1, clk) + verifier := newVerifierForTest(t, []byte{1}, clk) + cluster := ClusterInfoFunc(func(_ context.Context) (ClusterInfo, error) { + return ClusterInfo{NodeID: "n"}, nil + }) + + var nilMap MapCredentialStore // typed nil + _, err := NewServer(ServerDeps{ + Signer: signer, + Verifier: verifier, + Credentials: nilMap, + Roles: map[string]Role{"AKIA": RoleFull}, + ClusterInfo: cluster, + }) + require.Error(t, err) + require.Contains(t, err.Error(), "Credentials") +} + +func TestIsNilCredentialStore(t *testing.T) { + require.True(t, isNilCredentialStore(nil)) + require.True(t, isNilCredentialStore(MapCredentialStore(nil))) + + var noMap MapCredentialStore + require.True(t, isNilCredentialStore(noMap)) + + require.False(t, isNilCredentialStore(MapCredentialStore{"AKIA": "x"})) + // An empty (but non-nil) map is not considered nil — it is a valid + // "no credentials configured, reject all logins" posture; the + // wiring layer (main_admin.go) is responsible for refusing that + // shape when admin is enabled. + require.False(t, isNilCredentialStore(MapCredentialStore{})) +} diff --git a/main_admin.go b/main_admin.go index 650c4a25..a3781b88 100644 --- a/main_admin.go +++ b/main_admin.go @@ -64,6 +64,17 @@ func startAdminFromFlags(ctx context.Context, lc *net.ListenConfig, eg *errgroup if err != nil { return errors.Wrapf(err, "load static credentials for admin listener") } + // An admin listener with zero credentials would accept logins + // only to reject every one of them with invalid_credentials, so a + // missing or empty credentials file is a wiring bug rather than a + // valid "locked down" state. Failing fast here also guards against + // the typed-nil MapCredentialStore case inside NewServer (an + // untyped `== nil` check cannot detect a nil-map-valued interface + // on its own). + if len(staticCreds) == 0 { + return errors.New("admin listener is enabled but no static credentials are configured;" + + " set -s3CredentialsFile to a file with at least one entry,") + } cfg := adminListenerConfig{ enabled: *adminEnabled, listen: *adminListen, From a41b9e27a63d4d6ce112f9be3563ba8893b6298a Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 03:59:31 +0900 Subject: [PATCH 10/15] admin: address Codex round-2 review (2 P2 findings) - router: widen the API-namespace guard from /admin/api/v1 to the whole /admin/api* prefix. Requests like /admin/api, /admin/api/v2, or /admin/api/v2/tables now return a JSON 404 instead of falling through to the SPA fallback and being served with index.html. Split ServeHTTP into classify/dispatch with routeKind to keep the logic under the cyclomatic complexity ceiling, and expanded the dedicated JSON-only regression test to cover /admin/api and the v2 shapes. - main_admin.newClusterInfoSource: initialise GroupInfo.Members to an empty, non-nil slice before the best-effort Configuration() call so a transient raft state serialises as `"members": []` rather than `"members": null`. The always-array contract matches how the rest of the field is consumed. --- internal/admin/router.go | 99 ++++++++++++++++++++++++++--------- internal/admin/router_test.go | 14 ++++- main_admin.go | 6 ++- 3 files changed, 91 insertions(+), 28 deletions(-) diff --git a/internal/admin/router.go b/internal/admin/router.go index d525f50b..8ffcca01 100644 --- a/internal/admin/router.go +++ b/internal/admin/router.go @@ -18,8 +18,15 @@ import ( // directory itself so that requests like `/admin/api/v1` or // `/admin/assets` resolve to a JSON 404 rather than falling through to // the SPA fallback and being answered with index.html. +// +// pathAPIRoot / pathPrefixAPI guard the whole `/admin/api*` namespace — +// not just v1 — so that requests to currently-unimplemented API +// versions (`/admin/api`, `/admin/api/v2`, ...) return a JSON 404 +// instead of being silently answered with the SPA HTML. const ( pathPrefixAdmin = "/admin" + pathAPIRoot = "/admin/api" + pathPrefixAPI = pathAPIRoot + "/" pathAPIv1Root = "/admin/api/v1" pathPrefixAPIv1 = pathAPIv1Root + "/" pathHealthz = "/admin/healthz" @@ -68,38 +75,78 @@ func NewRouter(api http.Handler, static fs.FS) *Router { // 4. /admin/* → index.html (SPA fallback) // 5. anything else → 404 JSON func (rt *Router) ServeHTTP(w http.ResponseWriter, r *http.Request) { - p := r.URL.Path + rt.dispatch(rt.classify(r.URL.Path)).ServeHTTP(w, r) +} + +// routeKind enumerates the admin URL classes the router distinguishes. +// Splitting classify/dispatch keeps ServeHTTP under the cyclomatic +// complexity ceiling while preserving the strict evaluation order that +// API-before-SPA routing depends on. +type routeKind int +const ( + routeAPIv1 routeKind = iota + routeAPIOther + routeHealthz + routeAssetsRoot + routeAsset + routeSPA + routeUnknown +) + +func (rt *Router) classify(p string) routeKind { + if k, ok := classifyAPI(p); ok { + return k + } + if k, ok := classifyAssets(p); ok { + return k + } + if p == pathHealthz { + return routeHealthz + } + if p == pathPrefixAdmin || strings.HasPrefix(p, pathPrefixAdmin+"/") { + return routeSPA + } + return routeUnknown +} + +func classifyAPI(p string) (routeKind, bool) { switch { - case p == pathAPIv1Root: - // Bare /admin/api/v1 is an API root, not an SPA page: answer - // with a JSON 404 so clients never get HTML back on an API - // path. - rt.notFind.ServeHTTP(w, r) - return case strings.HasPrefix(p, pathPrefixAPIv1): - if rt.api == nil { - rt.notFind.ServeHTTP(w, r) - return - } - rt.api.ServeHTTP(w, r) - return - case p == pathHealthz: - rt.serveHealth(w, r) - return + return routeAPIv1, true + case p == pathAPIRoot, p == pathAPIv1Root, strings.HasPrefix(p, pathPrefixAPI): + return routeAPIOther, true + } + return 0, false +} + +func classifyAssets(p string) (routeKind, bool) { + switch { case p == pathAssetsRoot: - // Same reasoning as /admin/api/v1: the bare assets root is - // a directory, not an SPA route. - rt.notFind.ServeHTTP(w, r) - return + return routeAssetsRoot, true case strings.HasPrefix(p, pathPrefixAssets): - rt.serveAsset(w, r) - return - case p == pathPrefixAdmin || strings.HasPrefix(p, pathPrefixAdmin+"/"): - rt.serveSPA(w, r) - return + return routeAsset, true + } + return 0, false +} + +func (rt *Router) dispatch(k routeKind) http.Handler { + switch k { + case routeAPIv1: + if rt.api == nil { + return rt.notFind + } + return rt.api + case routeHealthz: + return http.HandlerFunc(rt.serveHealth) + case routeAsset: + return http.HandlerFunc(rt.serveAsset) + case routeSPA: + return http.HandlerFunc(rt.serveSPA) + case routeAPIOther, routeAssetsRoot, routeUnknown: + return rt.notFind } - rt.notFind.ServeHTTP(w, r) + return rt.notFind } func (rt *Router) serveHealth(w http.ResponseWriter, r *http.Request) { diff --git a/internal/admin/router_test.go b/internal/admin/router_test.go index b6d889bc..75e8d813 100644 --- a/internal/admin/router_test.go +++ b/internal/admin/router_test.go @@ -148,7 +148,19 @@ func TestRouter_BareAPIRootReturnsJSON404NotHTML(t *testing.T) { r := NewRouter(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusNoContent) }), newTestStatic()) - for _, p := range []string{"/admin/api/v1", "/admin/assets"} { + // /admin/api, /admin/api/v1 (bare), /admin/api/v2 (unimplemented), + // /admin/api/v2/foo (deeper under unknown version), and the bare + // /admin/assets directory must all resolve to a JSON 404 — never + // the SPA HTML fallback — so API clients and probes get a + // machine-readable answer. + paths := []string{ + "/admin/api", + "/admin/api/v1", + "/admin/api/v2", + "/admin/api/v2/tables", + "/admin/assets", + } + for _, p := range paths { req := httptest.NewRequest(http.MethodGet, p, nil) rec := httptest.NewRecorder() r.ServeHTTP(rec, req) diff --git a/main_admin.go b/main_admin.go index a3781b88..7eeaf2aa 100644 --- a/main_admin.go +++ b/main_admin.go @@ -266,7 +266,11 @@ func newClusterInfoSource(nodeID, version string, runtimes []*raftGroupRuntime) continue } status := rt.engine.Status() - var members []string + // Seed as an empty-but-non-nil slice so a + // Configuration() failure still JSON-encodes as `[]` + // rather than `null`; API consumers that treat + // members as an always-array field rely on this. + members := []string{} if cfg, err := rt.engine.Configuration(ctx); err == nil { members = make([]string, 0, len(cfg.Servers)) for _, srv := range cfg.Servers { From a4297f8bc560aa99b8ce2bec7320c3b4c1a36254 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 04:28:12 +0900 Subject: [PATCH 11/15] admin: protect logout with SessionAuth + CSRF (Codex P2) - server.buildAPIMux: route /admin/api/v1/auth/logout through the protected chain instead of publicAuth. The old wiring would accept a cross-site POST to /auth/logout and emit expired Set-Cookie headers unconditionally, giving an attacker a practical logout-CSRF path: a victim visiting a hostile site could be force-logged-out. Requiring both SessionAuth and the CSRF double-submit makes the route idempotent for legitimate callers and unreachable across origins (SameSite=Strict strips the cookies on cross-site POSTs, and the server rejects anything that lacks both the session cookie and the matching X-Admin-CSRF header). - New tests fix the behaviour: * Logout without a session returns 401 and emits no cookies. * Logout with a session but no CSRF header returns 403. * Logout with session + matching CSRF header returns 204 and emits both expired Set-Cookie headers so the browser actually forgets the session. --- internal/admin/logout_csrf_test.go | 100 +++++++++++++++++++++++++++++ internal/admin/server.go | 12 +++- 2 files changed, 109 insertions(+), 3 deletions(-) create mode 100644 internal/admin/logout_csrf_test.go diff --git a/internal/admin/logout_csrf_test.go b/internal/admin/logout_csrf_test.go new file mode 100644 index 00000000..de608333 --- /dev/null +++ b/internal/admin/logout_csrf_test.go @@ -0,0 +1,100 @@ +package admin + +import ( + "bytes" + "net/http" + "net/http/httptest" + "testing" + + "github.com/stretchr/testify/require" +) + +// TestLogout_RejectsUnauthenticated ensures a cross-site caller cannot +// POST /admin/api/v1/auth/logout without a valid session, which was the +// logout-CSRF vector Codex flagged. +func TestLogout_RejectsUnauthenticated(t *testing.T) { + srv := newServerForTest(t) + req := httptest.NewRequest(http.MethodPost, "/admin/api/v1/auth/logout", nil) + rec := httptest.NewRecorder() + srv.Handler().ServeHTTP(rec, req) + require.Equal(t, http.StatusUnauthorized, rec.Code) + // The server must not have set any cookies on a rejected logout. + require.Empty(t, rec.Result().Cookies()) +} + +// TestLogout_RequiresCSRF ensures that even with a valid session cookie, +// logout refuses to execute without a matching X-Admin-CSRF header. +// SameSite=Strict already blocks the cross-site leg, but the server-side +// CSRF check is an explicit belt-and-braces guard. +func TestLogout_RequiresCSRF(t *testing.T) { + srv := newServerForTest(t) + ts := httptest.NewServer(srv.Handler()) + defer ts.Close() + + // Log in to collect cookies. + cookies := loginForTest(t, ts) + + // POST /auth/logout with session cookie but NO X-Admin-CSRF header. + req := httptest.NewRequest(http.MethodPost, "/admin/api/v1/auth/logout", nil) + for _, c := range cookies { + req.AddCookie(c) + } + rec := httptest.NewRecorder() + srv.Handler().ServeHTTP(rec, req) + require.Equal(t, http.StatusForbidden, rec.Code) + require.Contains(t, rec.Body.String(), "csrf_missing") +} + +// TestLogout_HappyPath verifies that a well-formed logout (session +// cookie + matching CSRF header + cookie) succeeds and returns 204. +func TestLogout_HappyPath(t *testing.T) { + srv := newServerForTest(t) + ts := httptest.NewServer(srv.Handler()) + defer ts.Close() + + cookies := loginForTest(t, ts) + var csrfValue string + for _, c := range cookies { + if c.Name == csrfCookieName { + csrfValue = c.Value + } + } + require.NotEmpty(t, csrfValue) + + req := httptest.NewRequest(http.MethodPost, "/admin/api/v1/auth/logout", nil) + for _, c := range cookies { + req.AddCookie(c) + } + req.Header.Set(csrfHeaderName, csrfValue) + rec := httptest.NewRecorder() + srv.Handler().ServeHTTP(rec, req) + require.Equal(t, http.StatusNoContent, rec.Code) + + // Expired-cookie Set-Cookie headers must still be emitted so the + // client actually forgets the session. + var cleared int + for _, c := range rec.Result().Cookies() { + if c.MaxAge == -1 { + cleared++ + } + } + require.Equal(t, 2, cleared, "expected both admin_session and admin_csrf to be cleared") +} + +// loginForTest POSTs a valid login against ts and returns the resulting +// cookies. It is a small helper shared by the logout tests above. +func loginForTest(t *testing.T, ts *httptest.Server) []*http.Cookie { + t.Helper() + body := []byte(`{"access_key":"AKIA_ADMIN","secret_key":"ADMIN_SECRET"}`) + req, err := http.NewRequestWithContext(t.Context(), http.MethodPost, + ts.URL+"/admin/api/v1/auth/login", bytes.NewReader(body)) + require.NoError(t, err) + req.Header.Set("Content-Type", "application/json") + resp, err := http.DefaultClient.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode) + cookies := resp.Cookies() + require.Len(t, cookies, 2) + return cookies +} diff --git a/internal/admin/server.go b/internal/admin/server.go index 95c76620..5a64a452 100644 --- a/internal/admin/server.go +++ b/internal/admin/server.go @@ -146,8 +146,14 @@ func buildAPIMux(auth *AuthService, verifier *Verifier, clusterHandler http.Hand ), ) } - // Login / logout: body limit only. Audit is handled inside the - // AuthService so the actor field is populated correctly. + // Login is the only endpoint that runs without a pre-existing + // session — every other write must go through session + CSRF so + // a cross-site page cannot force a user to perform any state + // change against their will. Notably, /auth/logout goes through + // the protected chain to prevent logout-CSRF: a hostile site that + // POSTed to /auth/logout would otherwise be able to force-clear a + // victim's cookies even with SameSite=Strict sessions, because + // HandleLogout always emits expired Set-Cookie headers. publicAuth := func(next http.Handler) http.Handler { return BodyLimit(defaultBodyLimit)(next) } @@ -157,7 +163,7 @@ func buildAPIMux(auth *AuthService, verifier *Verifier, clusterHandler http.Hand case "/admin/api/v1/auth/login": publicAuth(loginHandler).ServeHTTP(w, r) case "/admin/api/v1/auth/logout": - publicAuth(logoutHandler).ServeHTTP(w, r) + protect(logoutHandler).ServeHTTP(w, r) case "/admin/api/v1/cluster": protect(clusterHandler).ServeHTTP(w, r) default: From 86f4406315b9bfcd19273c8f845448d437b0cb3c Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 05:47:15 +0900 Subject: [PATCH 12/15] admin: drop dummy io.Discard keeper in router_test (Copilot) router_test.go previously kept `io` imported solely for a `_ = io.Discard` placeholder at the end of a test. Remove the placeholder and the now-unused import; Go's unused-import rule will flag real drift from here on. --- internal/admin/router_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/internal/admin/router_test.go b/internal/admin/router_test.go index 75e8d813..12515b64 100644 --- a/internal/admin/router_test.go +++ b/internal/admin/router_test.go @@ -1,7 +1,6 @@ package admin import ( - "io" "net/http" "net/http/httptest" "strings" @@ -191,5 +190,4 @@ func TestRouter_APIHandlerReceivesFullPath(t *testing.T) { rec := httptest.NewRecorder() r.ServeHTTP(rec, req) require.Equal(t, "/admin/api/v1/dynamo/tables", received) - _ = io.Discard } From 884fd7d0c6bce2c7ccc27f0062decf414b146962 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 06:09:16 +0900 Subject: [PATCH 13/15] admin: address CodeRabbit round-1 review (15 findings) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1 actionable + 14 nitpicks: - config.decodeSigningKey: accept URL-safe base64 (RawURLEncoding / URLEncoding) in addition to std base64, so operators who paste a key from a Kubernetes Secret do not hit a spurious format error. - jwt.validateClaims: treat a missing iat the same as a missing exp. The admin Signer always sets iat, so a token without one is malformed, foreign, or from a future version we should not trust. - auth_handler: materialise unknownKeyPlaceholder inside initSecretCompareKey so the login failure path does not run HMAC-SHA256 from scratch on every bogus access key. - cluster_handler: log JSON encode failures via slog.Warn. The 200 header is already on the wire so we cannot change the status, but a broken-pipe or encoder error was previously silent. - router: use the standard io.ReadSeeker instead of an anonymous Read+Seek interface (twice). Also move the method check in serveSPA above the rt.static == nil branch so the response is uniform across binaries with and without an embedded SPA. - router_test: tighten the traversal test to Equal 404 + JSON content-type; NotEqual(200) was too loose (would pass on 500). - config.DecodedSigningKeys: fail loudly when called on a disabled config or when the primary/previous key decodes to zero bytes, instead of returning [][]byte{nil} and feeding a nil HMAC key into the verifier. - server.buildAPIMux: hoist the protect / publicAuth chains to startup so we do not rebuild the 4-5 middleware closures on every inbound request. - server: delete the endpointMatches and ctxClosed helpers (and the TestInternalHelpers wrapper) — they were test-only and endpointMatches used strings.EqualFold, which misrepresented the case-sensitive real routing. - server_nil_creds_test: convert TestIsNilCredentialStore to a table-driven form per project guidelines. - main.go: add -adminSessionSigningKeyFile / -adminSessionSigningKeyPreviousFile and the companion ELASTICKV_ADMIN_SESSION_SIGNING_KEY[_PREVIOUS] env vars, so the secret no longer has to sit in /proc//cmdline on Linux. startAdminFromFlags prefers file > env > argv. - main_admin.newAdminHTTPServer: drop the unused adminCfg parameter; the "reserved for future use" argument was YAGNI. - main_admin_test: pin tls.VersionTLS12 on the test HTTP client's TLS config so gosec's missing-ssl-minversion rule is satisfied. Tests: table-driven TestIsNilCredentialStore, tightened traversal assertion, double-slash JSON-404 case, and existing coverage around auth/router/server all green. golangci-lint --new-from-rev=origin/main: 0 issues. --- internal/admin/auth_handler.go | 26 +++++------- internal/admin/cluster_handler.go | 9 +++- internal/admin/config.go | 56 +++++++++++++++++++------ internal/admin/jwt.go | 9 +++- internal/admin/router.go | 24 +++++------ internal/admin/router_test.go | 8 ++-- internal/admin/server.go | 31 ++++---------- internal/admin/server_nil_creds_test.go | 32 +++++++++----- internal/admin/server_test.go | 11 ----- main.go | 22 +++++----- main_admin.go | 46 +++++++++++++++++--- main_admin_test.go | 5 ++- 12 files changed, 174 insertions(+), 105 deletions(-) diff --git a/internal/admin/auth_handler.go b/internal/admin/auth_handler.go index dd5173d0..a1aa9ad7 100644 --- a/internal/admin/auth_handler.go +++ b/internal/admin/auth_handler.go @@ -234,8 +234,9 @@ func readLoginRequest(w http.ResponseWriter, r *http.Request) (loginRequest, boo // expensive KDF would add latency to every login attempt without // changing the threat model. var ( - secretCompareKey []byte - secretCompareKeyOnce sync.Once + secretCompareKey []byte + secretCompareKeyOnce sync.Once + unknownKeyPlaceholder []byte ) func initSecretCompareKey() { @@ -246,6 +247,12 @@ func initSecretCompareKey() { // cannot authenticate anyone anyway. panic("admin: crypto/rand failure while initialising secret compare key: " + err.Error()) } + // Materialise the unknown-key-placeholder digest exactly once so + // the login failure hot path does not re-run HMAC on every try; + // the value is deterministic given secretCompareKey. + mac := hmac.New(sha256.New, secretCompareKey) + mac.Write([]byte("admin-auth-unknown-key-placeholder")) + unknownKeyPlaceholder = mac.Sum(nil) } // digestForCompare returns HMAC-SHA256(secretCompareKey, s). Used only @@ -257,22 +264,11 @@ func digestForCompare(s string) []byte { return mac.Sum(nil) } -// unknownKeyPlaceholder is a sentinel digest we compare against when the -// caller-supplied access key is not in the credential store. Using a -// deterministic value here keeps the work done by the authenticate path -// roughly equivalent between "unknown key" and "known key, wrong -// secret", so the two branches are not distinguishable by timing. -var unknownKeyPlaceholder = func() []byte { - secretCompareKeyOnce.Do(initSecretCompareKey) - mac := hmac.New(sha256.New, secretCompareKey) - mac.Write([]byte("admin-auth-unknown-key-placeholder")) - return mac.Sum(nil) -} - func (s *AuthService) authenticate(w http.ResponseWriter, req loginRequest) (AuthPrincipal, bool) { + secretCompareKeyOnce.Do(initSecretCompareKey) providedHash := digestForCompare(req.SecretKey) expected, known := s.creds.LookupSecret(req.AccessKey) - expectedHash := unknownKeyPlaceholder() + expectedHash := unknownKeyPlaceholder if known { expectedHash = digestForCompare(expected) } diff --git a/internal/admin/cluster_handler.go b/internal/admin/cluster_handler.go index 2b5d22ef..81b421e1 100644 --- a/internal/admin/cluster_handler.go +++ b/internal/admin/cluster_handler.go @@ -92,5 +92,12 @@ func (h *ClusterHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json; charset=utf-8") w.Header().Set("Cache-Control", "no-store") w.WriteHeader(http.StatusOK) - _ = json.NewEncoder(w).Encode(info) + if err := json.NewEncoder(w).Encode(info); err != nil { + // The 200 header is already on the wire, so we cannot + // change the status — but a truncated JSON body is hard + // to diagnose without a breadcrumb. + h.logger.LogAttrs(r.Context(), slog.LevelWarn, "admin cluster response encode failed", + slog.String("error", err.Error()), + ) + } } diff --git a/internal/admin/config.go b/internal/admin/config.go index 8742681f..241a68d0 100644 --- a/internal/admin/config.go +++ b/internal/admin/config.go @@ -103,6 +103,9 @@ func (c *Config) validateTLS() error { if tlsConfigured || !addressRequiresTLS(strings.TrimSpace(c.Listen)) || c.AllowPlaintextNonLoopback { return nil } + // errors.Newf already carries a stack; errors.WithStack here is + // for the project's wrapcheck linter, which requires every + // cockroachdb/errors return at a package boundary to be wrapped. return errors.WithStack(errors.Newf( "-adminListen %q is not loopback but TLS is not configured;"+ " set -adminTLSCertFile + -adminTLSKeyFile, or explicitly pass"+ @@ -130,21 +133,32 @@ func (c *Config) validateSigningKeys() error { // DecodedSigningKeys returns the raw HS256 keys in verification order: the // primary signing key first, followed by an optional previous key. Validate -// must be called first. +// must be called first; this method also asserts that contract defensively +// so a missing key cannot quietly produce a `[][]byte{nil}` result and feed +// a nil HMAC key into the verifier. func (c *Config) DecodedSigningKeys() ([][]byte, error) { + if c == nil || !c.Enabled { + return nil, errors.New("DecodedSigningKeys called on a disabled admin config") + } primary, err := decodeSigningKey("-adminSessionSigningKey", c.SessionSigningKey) if err != nil { return nil, err } + if len(primary) == 0 { + return nil, errors.New("-adminSessionSigningKey is empty; call Validate first") + } keys := [][]byte{primary} - if strings.TrimSpace(c.SessionSigningKeyPrevious) != "" { - prev, err := decodeSigningKey("-adminSessionSigningKeyPrevious", c.SessionSigningKeyPrevious) - if err != nil { - return nil, err - } - keys = append(keys, prev) + if strings.TrimSpace(c.SessionSigningKeyPrevious) == "" { + return keys, nil + } + prev, err := decodeSigningKey("-adminSessionSigningKeyPrevious", c.SessionSigningKeyPrevious) + if err != nil { + return nil, err + } + if len(prev) == 0 { + return nil, errors.New("-adminSessionSigningKeyPrevious is set but decoded to zero bytes") } - return keys, nil + return append(keys, prev), nil } // RoleIndex returns a map from access key to Role after Validate has @@ -175,13 +189,29 @@ func decodeSigningKey(field, encoded string) ([]byte, error) { if trim == "" { return nil, nil } - raw, err := base64.StdEncoding.DecodeString(trim) - if err != nil { - raw, err = base64.RawStdEncoding.DecodeString(trim) - if err != nil { - return nil, errors.Wrapf(err, "%s is not valid base64", field) + // Operators routinely copy keys out of Kubernetes Secrets or + // similar tooling that may emit either standard or URL-safe + // base64, with or without padding. Try each alphabet in turn so + // a correct key is never rejected over a formatting mismatch. + decoders := []*base64.Encoding{ + base64.StdEncoding, + base64.RawStdEncoding, + base64.URLEncoding, + base64.RawURLEncoding, + } + var ( + raw []byte + decErr error + ) + for _, dec := range decoders { + raw, decErr = dec.DecodeString(trim) + if decErr == nil { + break } } + if decErr != nil { + return nil, errors.Wrapf(decErr, "%s is not valid base64", field) + } if len(raw) != sessionSigningKeyLen { return nil, errors.WithStack(errors.Newf( "%s must decode to %d bytes but got %d bytes", diff --git a/internal/admin/jwt.go b/internal/admin/jwt.go index c27a7ce5..2b34c701 100644 --- a/internal/admin/jwt.go +++ b/internal/admin/jwt.go @@ -186,7 +186,14 @@ func (v *Verifier) validateClaims(claims jwtClaims) (AuthPrincipal, error) { if claims.EXP == 0 || now >= claims.EXP { return AuthPrincipal{}, errors.Wrap(ErrInvalidToken, "token expired") } - if claims.IAT != 0 && now+clockSkewToleranceSeconds < claims.IAT { + // A missing iat is treated the same as a missing exp: the admin + // Signer always sets iat, so a token without one is either + // malformed, produced by a foreign signer that happens to share + // the HS256 key, or a future-version token we do not recognise. + if claims.IAT == 0 { + return AuthPrincipal{}, errors.Wrap(ErrInvalidToken, "missing iat") + } + if now+clockSkewToleranceSeconds < claims.IAT { return AuthPrincipal{}, errors.Wrap(ErrInvalidToken, "token issued in the future") } if claims.Sub == "" { diff --git a/internal/admin/router.go b/internal/admin/router.go index 8ffcca01..afb36ac3 100644 --- a/internal/admin/router.go +++ b/internal/admin/router.go @@ -2,6 +2,7 @@ package admin import ( "errors" + "io" "io/fs" "net/http" "path" @@ -196,10 +197,7 @@ func (rt *Router) serveAsset(w http.ResponseWriter, r *http.Request) { rt.notFind.ServeHTTP(w, r) return } - readSeeker, ok := f.(interface { - Read(p []byte) (int, error) - Seek(offset int64, whence int) (int64, error) - }) + readSeeker, ok := f.(io.ReadSeeker) if !ok { // embed.FS files implement ReadSeeker, but be defensive. writeJSONError(w, http.StatusInternalServerError, "internal", "asset is not seekable") @@ -209,14 +207,19 @@ func (rt *Router) serveAsset(w http.ResponseWriter, r *http.Request) { } func (rt *Router) serveSPA(w http.ResponseWriter, r *http.Request) { - if rt.static == nil { - rt.notFind.ServeHTTP(w, r) - return - } + // Reject non-GET/HEAD methods before inspecting rt.static so the + // response is uniform across admin binaries whether or not the + // SPA bundle happens to be configured. Without this, a POST to + // /admin/something returned a JSON 404 with a nil static and a + // JSON 405 with a populated static — same URL, different answer. if r.Method != http.MethodGet && r.Method != http.MethodHead { writeJSONError(w, http.StatusMethodNotAllowed, "method_not_allowed", "only GET or HEAD supported") return } + if rt.static == nil { + rt.notFind.ServeHTTP(w, r) + return + } f, err := rt.static.Open(pathIndexHTML) if err != nil { rt.notFind.ServeHTTP(w, r) @@ -228,10 +231,7 @@ func (rt *Router) serveSPA(w http.ResponseWriter, r *http.Request) { rt.notFind.ServeHTTP(w, r) return } - readSeeker, ok := f.(interface { - Read(p []byte) (int, error) - Seek(offset int64, whence int) (int64, error) - }) + readSeeker, ok := f.(io.ReadSeeker) if !ok { writeJSONError(w, http.StatusInternalServerError, "internal", "index.html is not seekable") return diff --git a/internal/admin/router_test.go b/internal/admin/router_test.go index 12515b64..2868602e 100644 --- a/internal/admin/router_test.go +++ b/internal/admin/router_test.go @@ -91,9 +91,11 @@ func TestRouter_StaticTraversalRejected(t *testing.T) { rec := httptest.NewRecorder() r.ServeHTTP(rec, req) - // Requested path cleaned by net/http before reaching us is still - // under /admin/ — defence in depth: we reject any ".." segment. - require.NotEqual(t, http.StatusOK, rec.Code) + // A ".." path segment must resolve to the exact JSON 404 — any + // other response (302/400/500) would hide a regression in the + // validation path (e.g. the file opening and returning a 500). + require.Equal(t, http.StatusNotFound, rec.Code) + require.Contains(t, rec.Header().Get("Content-Type"), "application/json") } // Filenames containing ".." as a substring (but not as a path segment) diff --git a/internal/admin/server.go b/internal/admin/server.go index 5a64a452..b91bc33f 100644 --- a/internal/admin/server.go +++ b/internal/admin/server.go @@ -1,12 +1,10 @@ package admin import ( - "context" "io/fs" "log/slog" "net/http" "reflect" - "strings" ) // ServerDeps bundles the collaborators the admin HTTP surface needs. All @@ -158,14 +156,20 @@ func buildAPIMux(auth *AuthService, verifier *Verifier, clusterHandler http.Hand return BodyLimit(defaultBodyLimit)(next) } + // Build each route's middleware stack exactly once at startup + // rather than rebuilding 4–5 closures per inbound request. + loginChain := publicAuth(loginHandler) + logoutChain := protect(logoutHandler) + clusterChain := protect(clusterHandler) + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { switch r.URL.Path { case "/admin/api/v1/auth/login": - publicAuth(loginHandler).ServeHTTP(w, r) + loginChain.ServeHTTP(w, r) case "/admin/api/v1/auth/logout": - protect(logoutHandler).ServeHTTP(w, r) + logoutChain.ServeHTTP(w, r) case "/admin/api/v1/cluster": - protect(clusterHandler).ServeHTTP(w, r) + clusterChain.ServeHTTP(w, r) default: writeJSONError(w, http.StatusNotFound, "unknown_endpoint", "no admin API handler is registered for this path") @@ -173,17 +177,6 @@ func buildAPIMux(auth *AuthService, verifier *Verifier, clusterHandler http.Hand }) } -// ctxClosed is a small helper for shutdown signalling; kept here so -// tests do not need to import context directly. -func ctxClosed(ctx context.Context) bool { - select { - case <-ctx.Done(): - return true - default: - return false - } -} - func errMissing(field string) error { return &missingDepError{field: field} } @@ -227,9 +220,3 @@ func isNilableKind(k reflect.Kind) bool { _, ok := nilableKinds[k] return ok } - -// endpointMatches is a small helper kept here for readability in table -// driven path-routing tests. -func endpointMatches(path, want string) bool { - return strings.EqualFold(path, want) -} diff --git a/internal/admin/server_nil_creds_test.go b/internal/admin/server_nil_creds_test.go index b1b03f12..74708b24 100644 --- a/internal/admin/server_nil_creds_test.go +++ b/internal/admin/server_nil_creds_test.go @@ -34,16 +34,26 @@ func TestServer_RejectsTypedNilCredentialStore(t *testing.T) { } func TestIsNilCredentialStore(t *testing.T) { - require.True(t, isNilCredentialStore(nil)) - require.True(t, isNilCredentialStore(MapCredentialStore(nil))) - var noMap MapCredentialStore - require.True(t, isNilCredentialStore(noMap)) - - require.False(t, isNilCredentialStore(MapCredentialStore{"AKIA": "x"})) - // An empty (but non-nil) map is not considered nil — it is a valid - // "no credentials configured, reject all logins" posture; the - // wiring layer (main_admin.go) is responsible for refusing that - // shape when admin is enabled. - require.False(t, isNilCredentialStore(MapCredentialStore{})) + cases := []struct { + name string + store CredentialStore + want bool + }{ + {"untyped nil", nil, true}, + {"typed nil map literal", MapCredentialStore(nil), true}, + {"zero-value MapCredentialStore variable", noMap, true}, + {"non-empty map", MapCredentialStore{"AKIA": "x"}, false}, + // An empty (but non-nil) map is not considered nil — it + // is a valid "no credentials configured, reject all + // logins" posture; the wiring layer (main_admin.go) is + // responsible for refusing that shape when admin is + // enabled. + {"empty non-nil map", MapCredentialStore{}, false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + require.Equal(t, tc.want, isNilCredentialStore(tc.store)) + }) + } } diff --git a/internal/admin/server_test.go b/internal/admin/server_test.go index 2b421572..f71e9d80 100644 --- a/internal/admin/server_test.go +++ b/internal/admin/server_test.go @@ -166,14 +166,3 @@ func TestServer_WriteRejectsMissingCSRF(t *testing.T) { srv.Handler().ServeHTTP(rec, req) require.Contains(t, []int{http.StatusUnauthorized, http.StatusForbidden, http.StatusMethodNotAllowed}, rec.Code) } - -// Smoke-test that ctxClosed and endpointMatches compile & work; these -// are small internal helpers that would otherwise skew coverage. -func TestInternalHelpers(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - require.False(t, ctxClosed(ctx)) - cancel() - require.True(t, ctxClosed(ctx)) - require.True(t, endpointMatches("/a", "/A")) - require.False(t, endpointMatches("/a", "/b")) -} diff --git a/main.go b/main.go index 02a3820d..36a2e86c 100644 --- a/main.go +++ b/main.go @@ -102,16 +102,18 @@ var ( raftDynamoMap = flag.String("raftDynamoMap", "", "Map of Raft address to DynamoDB address (raftAddr=dynamoAddr,...)") raftSqsMap = flag.String("raftSqsMap", "", "Map of Raft address to SQS address (raftAddr=sqsAddr,...)") - adminEnabled = flag.Bool("adminEnabled", false, "Enable the admin HTTP listener") - adminListen = flag.String("adminListen", "127.0.0.1:8080", "host:port for the admin HTTP listener (loopback by default)") - adminTLSCertFile = flag.String("adminTLSCertFile", "", "PEM-encoded TLS certificate for the admin listener") - adminTLSKeyFile = flag.String("adminTLSKeyFile", "", "PEM-encoded TLS private key for the admin listener") - adminAllowPlaintextNonLoopback = flag.Bool("adminAllowPlaintextNonLoopback", false, "Allow the admin listener to bind a non-loopback address without TLS (strongly discouraged)") - adminAllowInsecureDevCookie = flag.Bool("adminAllowInsecureDevCookie", false, "Mint admin cookies without the Secure attribute (local plaintext dev only)") - adminSessionSigningKey = flag.String("adminSessionSigningKey", "", "Cluster-shared base64 HS256 key (64 bytes decoded) that signs admin session tokens") - adminSessionSigningKeyPrevious = flag.String("adminSessionSigningKeyPrevious", "", "Optional previous admin HS256 key accepted only for verification during rotation") - adminReadOnlyAccessKeys = flag.String("adminReadOnlyAccessKeys", "", "Comma-separated SigV4 access keys granted read-only admin access") - adminFullAccessKeys = flag.String("adminFullAccessKeys", "", "Comma-separated SigV4 access keys granted full-access admin role") + adminEnabled = flag.Bool("adminEnabled", false, "Enable the admin HTTP listener") + adminListen = flag.String("adminListen", "127.0.0.1:8080", "host:port for the admin HTTP listener (loopback by default)") + adminTLSCertFile = flag.String("adminTLSCertFile", "", "PEM-encoded TLS certificate for the admin listener") + adminTLSKeyFile = flag.String("adminTLSKeyFile", "", "PEM-encoded TLS private key for the admin listener") + adminAllowPlaintextNonLoopback = flag.Bool("adminAllowPlaintextNonLoopback", false, "Allow the admin listener to bind a non-loopback address without TLS (strongly discouraged)") + adminAllowInsecureDevCookie = flag.Bool("adminAllowInsecureDevCookie", false, "Mint admin cookies without the Secure attribute (local plaintext dev only)") + adminSessionSigningKey = flag.String("adminSessionSigningKey", "", "Cluster-shared base64 HS256 key (64 bytes decoded); prefer -adminSessionSigningKeyFile / ELASTICKV_ADMIN_SESSION_SIGNING_KEY so the value does not appear in /proc//cmdline") + adminSessionSigningKeyFile = flag.String("adminSessionSigningKeyFile", "", "Path to a file containing the base64-encoded primary admin HS256 key; avoids leaking the secret via argv") + adminSessionSigningKeyPrevious = flag.String("adminSessionSigningKeyPrevious", "", "Optional previous admin HS256 key accepted only for verification during rotation; prefer -adminSessionSigningKeyPreviousFile") + adminSessionSigningKeyPreviousFile = flag.String("adminSessionSigningKeyPreviousFile", "", "Path to a file containing the base64-encoded previous admin HS256 key used for rotation") + adminReadOnlyAccessKeys = flag.String("adminReadOnlyAccessKeys", "", "Comma-separated SigV4 access keys granted read-only admin access") + adminFullAccessKeys = flag.String("adminFullAccessKeys", "", "Comma-separated SigV4 access keys granted full-access admin role") ) // memoryPressureExit is set to true by the memwatch OnExceed callback to diff --git a/main_admin.go b/main_admin.go index 7eeaf2aa..e50ed5c5 100644 --- a/main_admin.go +++ b/main_admin.go @@ -6,6 +6,7 @@ import ( "log/slog" "net" "net/http" + "os" "strings" "time" @@ -14,6 +15,14 @@ import ( "golang.org/x/sync/errgroup" ) +// Environment variables that the admin listener consults before +// falling back to the command-line flag values. Exposing secrets via +// env vars / file paths keeps them out of /proc//cmdline. +const ( + envAdminSessionSigningKey = "ELASTICKV_ADMIN_SESSION_SIGNING_KEY" + envAdminSessionSigningKeyPrevious = "ELASTICKV_ADMIN_SESSION_SIGNING_KEY_PREVIOUS" +) + const ( adminReadHeaderTimeout = 5 * time.Second adminWriteTimeout = 10 * time.Second @@ -75,6 +84,14 @@ func startAdminFromFlags(ctx context.Context, lc *net.ListenConfig, eg *errgroup return errors.New("admin listener is enabled but no static credentials are configured;" + " set -s3CredentialsFile to a file with at least one entry,") } + primaryKey, err := resolveSigningKey(*adminSessionSigningKey, *adminSessionSigningKeyFile, envAdminSessionSigningKey) + if err != nil { + return errors.Wrap(err, "resolve -adminSessionSigningKey") + } + previousKey, err := resolveSigningKey(*adminSessionSigningKeyPrevious, *adminSessionSigningKeyPreviousFile, envAdminSessionSigningKeyPrevious) + if err != nil { + return errors.Wrap(err, "resolve -adminSessionSigningKeyPrevious") + } cfg := adminListenerConfig{ enabled: *adminEnabled, listen: *adminListen, @@ -82,8 +99,8 @@ func startAdminFromFlags(ctx context.Context, lc *net.ListenConfig, eg *errgroup tlsKeyFile: *adminTLSKeyFile, allowPlaintextNonLoopback: *adminAllowPlaintextNonLoopback, allowInsecureDevCookie: *adminAllowInsecureDevCookie, - sessionSigningKey: *adminSessionSigningKey, - sessionSigningKeyPrevious: *adminSessionSigningKeyPrevious, + sessionSigningKey: primaryKey, + sessionSigningKeyPrevious: previousKey, readOnlyAccessKeys: parseCSV(*adminReadOnlyAccessKeys), fullAccessKeys: parseCSV(*adminFullAccessKeys), } @@ -137,7 +154,7 @@ func startAdminServer( if err != nil { return "", err } - httpSrv := newAdminHTTPServer(server, &adminCfg) + httpSrv := newAdminHTTPServer(server) listener, err := lc.Listen(ctx, "tcp", adminCfg.Listen) if err != nil { return "", errors.Wrapf(err, "failed to listen on admin address %s", adminCfg.Listen) @@ -204,8 +221,7 @@ func buildAdminHTTPServer(adminCfg *admin.Config, creds map[string]string, clust return server, nil } -func newAdminHTTPServer(server *admin.Server, adminCfg *admin.Config) *http.Server { - _ = adminCfg // reserved for future use (e.g. per-listener TLS overrides) +func newAdminHTTPServer(server *admin.Server) *http.Server { return &http.Server{ Handler: server.Handler(), ReadHeaderTimeout: adminReadHeaderTimeout, @@ -292,6 +308,26 @@ func newClusterInfoSource(nodeID, version string, runtimes []*raftGroupRuntime) }) } +// resolveSigningKey picks the effective admin signing key from, in +// priority order: the --*File flag (file contents), the env var, and +// finally the --*Flag argv value. Preferring the file/env paths keeps +// the raw base64 out of /proc//cmdline on Linux. Returns the empty +// string when every source is unset — callers that require a value +// (validated elsewhere) must handle that case themselves. +func resolveSigningKey(flagValue, filePath, envVar string) (string, error) { + if strings.TrimSpace(filePath) != "" { + b, err := os.ReadFile(filePath) + if err != nil { + return "", errors.Wrapf(err, "read admin signing key file %q", filePath) + } + return strings.TrimSpace(string(b)), nil + } + if v := strings.TrimSpace(os.Getenv(envVar)); v != "" { + return v, nil + } + return strings.TrimSpace(flagValue), nil +} + // parseCSV splits a flag value like "a,b,c" into a slice with empty and // whitespace-only entries dropped. It is not in shard_config.go because // admin's comma-separated list format is simpler than raft groups. diff --git a/main_admin_test.go b/main_admin_test.go index c0f91302..13ecd07a 100644 --- a/main_admin_test.go +++ b/main_admin_test.go @@ -178,7 +178,10 @@ func TestStartAdminServer_ServesTLS(t *testing.T) { addr, err := startAdminServer(eCtx, &lc, eg, cfg, map[string]string{}, cluster, "test") require.NoError(t, err) - transport := &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}} //nolint:gosec // self-signed in test + transport := &http.Transport{TLSClientConfig: &tls.Config{ + InsecureSkipVerify: true, //nolint:gosec // self-signed certificate in test; server identity is not what we assert here. + MinVersion: tls.VersionTLS12, + }} client := &http.Client{Transport: transport, Timeout: 2 * time.Second} deadline := time.Now().Add(3 * time.Second) var resp *http.Response From 77d656c4a1aa8d243c383effa6bf9881d7c48405 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 06:16:23 +0900 Subject: [PATCH 14/15] admin: tidy doc drift + error text (Copilot review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four small style findings from the latest Copilot review: - main_admin: drop the trailing comma and fix the stray leading space in the "no static credentials" error so operators see a clean, copy/pasteable instruction. - auth_handler.HandleLogout: the comment still said the handler "does not require authentication", but after the logout-CSRF fix the route is wired through SessionAuth + CSRF. Rewrite the comment to describe the actual behaviour so future readers do not loosen the protection based on outdated docs. - cluster_handler.NewClusterHandler: the doc mentioned "passing a nil logger falls back to slog.Default()", but the constructor does not take a logger argument — the logger is seeded with slog.Default() internally and overridden via WithLogger. Reword the doc to describe the actual API. - auth_handler.authenticate: drop the redundant secretCompareKeyOnce.Do(initSecretCompareKey) at the top of the function; digestForCompare already runs the once-guard internally. No behavioural change beyond the string tidy. Tests + lint green. --- internal/admin/auth_handler.go | 13 ++++++++----- internal/admin/cluster_handler.go | 5 +++-- main_admin.go | 4 ++-- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/internal/admin/auth_handler.go b/internal/admin/auth_handler.go index a1aa9ad7..31b38d60 100644 --- a/internal/admin/auth_handler.go +++ b/internal/admin/auth_handler.go @@ -265,7 +265,8 @@ func digestForCompare(s string) []byte { } func (s *AuthService) authenticate(w http.ResponseWriter, req loginRequest) (AuthPrincipal, bool) { - secretCompareKeyOnce.Do(initSecretCompareKey) + // digestForCompare runs secretCompareKeyOnce.Do internally, so by + // the time it returns unknownKeyPlaceholder is already populated. providedHash := digestForCompare(req.SecretKey) expected, known := s.creds.LookupSecret(req.AccessKey) expectedHash := unknownKeyPlaceholder @@ -316,10 +317,12 @@ func (s *AuthService) issueSession(w http.ResponseWriter, principal AuthPrincipa _ = json.NewEncoder(w).Encode(loginResponse{Role: principal.Role, ExpiresAt: expires}) } -// HandleLogout clears both cookies. It does not require authentication — -// clearing stale cookies after a session has expired is always safe. We -// best-effort decode the incoming session cookie so the audit log can -// record who logged out; a missing or invalid cookie leaves actor empty. +// HandleLogout clears both cookies. The route is wired behind the +// protected middleware chain (SessionAuth + CSRF), so unauthenticated +// or cross-site callers are rejected before they reach this handler — +// that is what prevents logout-CSRF. We still best-effort decode the +// incoming session cookie so the audit log can record who logged out; +// a missing or invalid cookie leaves actor empty. func (s *AuthService) HandleLogout(w http.ResponseWriter, r *http.Request) { rec := newStatusRecorder(w) defer s.auditLogout(r, rec) diff --git a/internal/admin/cluster_handler.go b/internal/admin/cluster_handler.go index 81b421e1..d0be3d97 100644 --- a/internal/admin/cluster_handler.go +++ b/internal/admin/cluster_handler.go @@ -51,8 +51,9 @@ type ClusterHandler struct { logger *slog.Logger } -// NewClusterHandler wires a source into the HTTP handler. Passing a nil -// logger falls back to slog.Default() so existing callers keep working. +// NewClusterHandler wires a source into the HTTP handler and seeds +// logging with slog.Default(). Callers that want a tagged logger can +// chain WithLogger(...) on the returned handler. func NewClusterHandler(source ClusterInfoSource) *ClusterHandler { return &ClusterHandler{source: source, logger: slog.Default()} } diff --git a/main_admin.go b/main_admin.go index e50ed5c5..a640d7ff 100644 --- a/main_admin.go +++ b/main_admin.go @@ -81,8 +81,8 @@ func startAdminFromFlags(ctx context.Context, lc *net.ListenConfig, eg *errgroup // untyped `== nil` check cannot detect a nil-map-valued interface // on its own). if len(staticCreds) == 0 { - return errors.New("admin listener is enabled but no static credentials are configured;" + - " set -s3CredentialsFile to a file with at least one entry,") + return errors.New("admin listener is enabled but no static credentials are configured; " + + "set -s3CredentialsFile to a file with at least one entry") } primaryKey, err := resolveSigningKey(*adminSessionSigningKey, *adminSessionSigningKeyFile, envAdminSessionSigningKey) if err != nil { From 679d60500ac08430290c2909d1cf40640056947b Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 06:30:25 +0900 Subject: [PATCH 15/15] admin: equalize HMAC work across login branches (Codex P2) The previous precompute of unknownKeyPlaceholder (done one commit earlier to satisfy CodeRabbit) created the very problem Codex just flagged: the known-access-key branch called digestForCompare on the expected secret every time, while the unknown branch reused the precomputed digest for free. The difference in HMAC work was large enough to enumerate valid access keys via login latency. Fix: hash a deterministic placeholder secret fresh on every unknown branch so both paths always run exactly one HMAC-SHA256. The hot path is still O(1) HMAC; it is not the free-read the precompute would have given us, but it is what the timing-equality contract requires. Also drop the leftover `unknownKeyPlaceholder` package variable and its precompute inside initSecretCompareKey, since the new branch no longer needs a cached digest. gosec flags the deterministic placeholder string as a potential credential; tag the const with a //nolint:gosec comment that explains why it is not one. --- internal/admin/auth_handler.go | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/internal/admin/auth_handler.go b/internal/admin/auth_handler.go index 31b38d60..6804a653 100644 --- a/internal/admin/auth_handler.go +++ b/internal/admin/auth_handler.go @@ -234,11 +234,19 @@ func readLoginRequest(w http.ResponseWriter, r *http.Request) (loginRequest, boo // expensive KDF would add latency to every login attempt without // changing the threat model. var ( - secretCompareKey []byte - secretCompareKeyOnce sync.Once - unknownKeyPlaceholder []byte + secretCompareKey []byte + secretCompareKeyOnce sync.Once ) +// unknownKeySecretPlaceholder is a deterministic dummy we hash when +// the incoming access key is unknown. We hash it fresh on every call +// (rather than precomputing the digest once) so the unknown-key +// branch performs the same HMAC work as the known-key branch — +// otherwise an attacker could enumerate valid access keys by +// measuring login latency. It is NOT a credential: nothing grants +// access to any resource, and grepping for it finds only this file. +const unknownKeySecretPlaceholder = "admin-auth-unknown-key-placeholder" //nolint:gosec // intentional non-credential sentinel; see comment above. + func initSecretCompareKey() { secretCompareKey = make([]byte, sha256.Size) if _, err := rand.Read(secretCompareKey); err != nil { @@ -247,12 +255,6 @@ func initSecretCompareKey() { // cannot authenticate anyone anyway. panic("admin: crypto/rand failure while initialising secret compare key: " + err.Error()) } - // Materialise the unknown-key-placeholder digest exactly once so - // the login failure hot path does not re-run HMAC on every try; - // the value is deterministic given secretCompareKey. - mac := hmac.New(sha256.New, secretCompareKey) - mac.Write([]byte("admin-auth-unknown-key-placeholder")) - unknownKeyPlaceholder = mac.Sum(nil) } // digestForCompare returns HMAC-SHA256(secretCompareKey, s). Used only @@ -265,14 +267,18 @@ func digestForCompare(s string) []byte { } func (s *AuthService) authenticate(w http.ResponseWriter, req loginRequest) (AuthPrincipal, bool) { - // digestForCompare runs secretCompareKeyOnce.Do internally, so by - // the time it returns unknownKeyPlaceholder is already populated. providedHash := digestForCompare(req.SecretKey) expected, known := s.creds.LookupSecret(req.AccessKey) - expectedHash := unknownKeyPlaceholder - if known { - expectedHash = digestForCompare(expected) + // Compute the expected digest fresh in BOTH branches so the + // amount of HMAC work is identical regardless of whether the + // access key is known. A precomputed placeholder digest would + // make the unknown-key path measurably faster, letting an + // attacker enumerate valid access keys via login latency. + expectedSecret := expected + if !known { + expectedSecret = unknownKeySecretPlaceholder } + expectedHash := digestForCompare(expectedSecret) match := subtle.ConstantTimeCompare(providedHash, expectedHash) == 1 if !known || !match { writeJSONError(w, http.StatusUnauthorized, "invalid_credentials",