From 3881f4a997129e677f03e680a8b64e9c02a907f2 Mon Sep 17 00:00:00 2001 From: krisztianfekete Date: Thu, 14 May 2026 13:25:20 +0200 Subject: [PATCH 1/2] make Postgres connect timeout configurable --- charts/agentevals/templates/deployment.yaml | 2 + charts/agentevals/values.yaml | 5 +++ src/agentevals/storage/postgres/migrator.py | 45 +++++++++++++++++---- src/agentevals/storage/postgres/pool.py | 4 +- 4 files changed, 46 insertions(+), 10 deletions(-) diff --git a/charts/agentevals/templates/deployment.yaml b/charts/agentevals/templates/deployment.yaml index b358b1a..2fe9780 100644 --- a/charts/agentevals/templates/deployment.yaml +++ b/charts/agentevals/templates/deployment.yaml @@ -77,6 +77,8 @@ spec: value: {{ .Values.database.postgres.schema | quote }} - name: AGENTEVALS_AUTO_MIGRATE value: {{ .Values.database.postgres.autoMigrate | quote }} + - name: AGENTEVALS_DB_CONNECT_TIMEOUT_S + value: {{ .Values.database.postgres.connectTimeoutSeconds | quote }} {{- if .Values.database.postgres.urlFile }} - name: AGENTEVALS_DATABASE_URL_FILE value: {{ .Values.database.postgres.urlFile | quote }} diff --git a/charts/agentevals/values.yaml b/charts/agentevals/values.yaml index e7c9c49..5a7a0a1 100644 --- a/charts/agentevals/values.yaml +++ b/charts/agentevals/values.yaml @@ -211,6 +211,11 @@ database: # false the server refuses to start if the schema is behind or dirty; # run "agentevals migrate up" manually in that case. autoMigrate: true + # -- Seconds the startup will spend retrying the initial Postgres + # connection before the pod aborts. Default 600s spans bring-up of a + # freshly provisioned database. Raise startupProbe.failureThreshold + # to match if you increase this. + connectTimeoutSeconds: 600 # -- Bundled Postgres instance for development and evaluation only. # Not suitable for production. Deployed when enabled is true and url / # urlFile are not set. diff --git a/src/agentevals/storage/postgres/migrator.py b/src/agentevals/storage/postgres/migrator.py index 4c09c76..a9bb6c1 100644 --- a/src/agentevals/storage/postgres/migrator.py +++ b/src/agentevals/storage/postgres/migrator.py @@ -13,6 +13,7 @@ import asyncio import logging +import os import re from dataclasses import dataclass from importlib.resources import files @@ -252,23 +253,51 @@ def discover_migrations() -> list[Migration]: return _discover_migrations() -CONNECT_RETRY_DEADLINE_S = 60.0 -"""Total wall-clock budget for the initial Postgres connection. Bundled PG -in Kubernetes typically takes 5-15s to be ready (PVC bind, initdb, listener -bind), so the agentevals lifespan can race the database on a fresh deploy. -Retrying tolerates that gap rather than failing pod startup and relying on -CrashLoopBackOff timing to eventually line up.""" +CONNECT_RETRY_DEADLINE_S = 600.0 +"""Default total wall-clock budget for the initial Postgres connection. +Sized to span Kubernetes bring-up of a freshly provisioned database (PVC +bind, initdb, listener bind, network policy propagation). Override at +runtime by setting ``AGENTEVALS_DB_CONNECT_TIMEOUT_S`` to a positive +number of seconds; an invalid value logs a warning and falls back to this +default.""" + + +def connect_deadline_seconds() -> float: + """Resolve the connect-retry budget. Reads ``AGENTEVALS_DB_CONNECT_TIMEOUT_S`` + and falls back to :data:`CONNECT_RETRY_DEADLINE_S` if the env var is + unset, empty, non-numeric, or non-positive.""" + raw = os.getenv("AGENTEVALS_DB_CONNECT_TIMEOUT_S") + if raw is None or raw == "": + return CONNECT_RETRY_DEADLINE_S + try: + val = float(raw) + except ValueError: + logger.warning( + "Invalid AGENTEVALS_DB_CONNECT_TIMEOUT_S=%r (not a number); using default %.0fs", + raw, + CONNECT_RETRY_DEADLINE_S, + ) + return CONNECT_RETRY_DEADLINE_S + if val <= 0: + logger.warning( + "Invalid AGENTEVALS_DB_CONNECT_TIMEOUT_S=%r (must be positive); using default %.0fs", + raw, + CONNECT_RETRY_DEADLINE_S, + ) + return CONNECT_RETRY_DEADLINE_S + return val async def connect_with_retry(dsn: str, asyncpg_module) -> "asyncpg.Connection": """Open a single asyncpg connection, retrying on connection-refused or - server-not-ready errors for up to ``CONNECT_RETRY_DEADLINE_S`` seconds. + server-not-ready errors for up to :func:`connect_deadline_seconds` + seconds. Connection-time errors are tolerated; once a connection has been established and a query returned, all subsequent failures propagate normally. """ - deadline = asyncio.get_event_loop().time() + CONNECT_RETRY_DEADLINE_S + deadline = asyncio.get_event_loop().time() + connect_deadline_seconds() delay = 0.5 while True: try: diff --git a/src/agentevals/storage/postgres/pool.py b/src/agentevals/storage/postgres/pool.py index f1446e0..0fd39d7 100644 --- a/src/agentevals/storage/postgres/pool.py +++ b/src/agentevals/storage/postgres/pool.py @@ -50,9 +50,9 @@ async def create_pool(settings: StorageSettings) -> "asyncpg.Pool": settings.schema_name, ) - from .migrator import CONNECT_RETRY_DEADLINE_S + from .migrator import connect_deadline_seconds - deadline = asyncio.get_event_loop().time() + CONNECT_RETRY_DEADLINE_S + deadline = asyncio.get_event_loop().time() + connect_deadline_seconds() delay = 0.5 while True: try: From ceddb4fadd34467d25802e1e4c527a82c9e88206 Mon Sep 17 00:00:00 2001 From: krisztianfekete Date: Thu, 14 May 2026 13:38:36 +0200 Subject: [PATCH 2/2] address review comments --- charts/agentevals/values.yaml | 7 +-- src/agentevals/storage/postgres/migrator.py | 10 +++- tests/storage/test_migrator.py | 58 +++++++++++++++++++++ 3 files changed, 71 insertions(+), 4 deletions(-) diff --git a/charts/agentevals/values.yaml b/charts/agentevals/values.yaml index 5a7a0a1..1b588a9 100644 --- a/charts/agentevals/values.yaml +++ b/charts/agentevals/values.yaml @@ -212,9 +212,10 @@ database: # run "agentevals migrate up" manually in that case. autoMigrate: true # -- Seconds the startup will spend retrying the initial Postgres - # connection before the pod aborts. Default 600s spans bring-up of a - # freshly provisioned database. Raise startupProbe.failureThreshold - # to match if you increase this. + # connection before the pod aborts. Default 600s matches the chart's + # hard-coded startupProbe budget (failureThreshold 60 x periodSeconds + # 10). Going above 600s requires overriding the probe in your own + # downstream template. connectTimeoutSeconds: 600 # -- Bundled Postgres instance for development and evaluation only. # Not suitable for production. Deployed when enabled is true and url / diff --git a/src/agentevals/storage/postgres/migrator.py b/src/agentevals/storage/postgres/migrator.py index a9bb6c1..bcb9145 100644 --- a/src/agentevals/storage/postgres/migrator.py +++ b/src/agentevals/storage/postgres/migrator.py @@ -13,6 +13,7 @@ import asyncio import logging +import math import os import re from dataclasses import dataclass @@ -265,7 +266,7 @@ def discover_migrations() -> list[Migration]: def connect_deadline_seconds() -> float: """Resolve the connect-retry budget. Reads ``AGENTEVALS_DB_CONNECT_TIMEOUT_S`` and falls back to :data:`CONNECT_RETRY_DEADLINE_S` if the env var is - unset, empty, non-numeric, or non-positive.""" + unset, empty, non-numeric, non-finite, or non-positive.""" raw = os.getenv("AGENTEVALS_DB_CONNECT_TIMEOUT_S") if raw is None or raw == "": return CONNECT_RETRY_DEADLINE_S @@ -278,6 +279,13 @@ def connect_deadline_seconds() -> float: CONNECT_RETRY_DEADLINE_S, ) return CONNECT_RETRY_DEADLINE_S + if not math.isfinite(val): + logger.warning( + "Invalid AGENTEVALS_DB_CONNECT_TIMEOUT_S=%r (must be finite); using default %.0fs", + raw, + CONNECT_RETRY_DEADLINE_S, + ) + return CONNECT_RETRY_DEADLINE_S if val <= 0: logger.warning( "Invalid AGENTEVALS_DB_CONNECT_TIMEOUT_S=%r (must be positive); using default %.0fs", diff --git a/tests/storage/test_migrator.py b/tests/storage/test_migrator.py index 5f3bc28..f9765f7 100644 --- a/tests/storage/test_migrator.py +++ b/tests/storage/test_migrator.py @@ -7,6 +7,7 @@ from __future__ import annotations +import logging import os import re @@ -14,10 +15,12 @@ from agentevals.storage.postgres.migrator import ( ADVISORY_LOCK_KEY, + CONNECT_RETRY_DEADLINE_S, Migration, Migrator, _apply_schema, _discover_migrations, + connect_deadline_seconds, discover_migrations, ) @@ -71,6 +74,61 @@ def test_stable(self): assert ADVISORY_LOCK_KEY == 7259820376655812345 +class TestConnectDeadlineSeconds: + """``connect_deadline_seconds`` resolves AGENTEVALS_DB_CONNECT_TIMEOUT_S + to a float, falling back to CONNECT_RETRY_DEADLINE_S on any input the + retry loop cannot consume. Each failure mode logs at WARNING so the + cause is diagnosable from pod logs.""" + + @pytest.fixture(autouse=True) + def _clean_env(self, monkeypatch): + monkeypatch.delenv("AGENTEVALS_DB_CONNECT_TIMEOUT_S", raising=False) + + def test_unset_returns_default(self): + assert connect_deadline_seconds() == CONNECT_RETRY_DEADLINE_S + + def test_empty_returns_default(self, monkeypatch): + monkeypatch.setenv("AGENTEVALS_DB_CONNECT_TIMEOUT_S", "") + assert connect_deadline_seconds() == CONNECT_RETRY_DEADLINE_S + + @pytest.mark.parametrize( + ("raw", "expected"), + [ + ("42", 42.0), + ("120.5", 120.5), + ("0.1", 0.1), + ("3600", 3600.0), + ], + ) + def test_parses_valid_positive_values(self, monkeypatch, raw, expected): + monkeypatch.setenv("AGENTEVALS_DB_CONNECT_TIMEOUT_S", raw) + assert connect_deadline_seconds() == expected + + @pytest.mark.parametrize( + ("raw", "reason_substring"), + [ + ("foo", "not a number"), + ("nan", "must be finite"), + ("inf", "must be finite"), + ("-inf", "must be finite"), + ("0", "must be positive"), + ("-5", "must be positive"), + ], + ) + def test_invalid_values_fall_back_with_warning(self, monkeypatch, caplog, raw, reason_substring): + """Bad inputs return the default and log exactly one warning that + names the specific validation branch. The cardinality check + guards against a refactor that double-logs (e.g. emits both a + generic and a specific message).""" + monkeypatch.setenv("AGENTEVALS_DB_CONNECT_TIMEOUT_S", raw) + with caplog.at_level(logging.WARNING, logger="agentevals.storage.postgres.migrator"): + result = connect_deadline_seconds() + assert result == CONNECT_RETRY_DEADLINE_S + warnings = [r.getMessage() for r in caplog.records if r.levelname == "WARNING"] + assert len(warnings) == 1, f"expected one warning, got {warnings}" + assert reason_substring in warnings[0] + + class TestMigrationFilePattern: def test_filename_format(self): migrations = _discover_migrations()