Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions charts/agentevals/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ spec:
value: {{ .Values.database.postgres.schema | quote }}
- name: AGENTEVALS_AUTO_MIGRATE
value: {{ .Values.database.postgres.autoMigrate | quote }}
- name: AGENTEVALS_DB_CONNECT_TIMEOUT_S
value: {{ .Values.database.postgres.connectTimeoutSeconds | quote }}
{{- if .Values.database.postgres.urlFile }}
- name: AGENTEVALS_DATABASE_URL_FILE
value: {{ .Values.database.postgres.urlFile | quote }}
Expand Down
6 changes: 6 additions & 0 deletions charts/agentevals/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,12 @@ database:
# false the server refuses to start if the schema is behind or dirty;
# run "agentevals migrate up" manually in that case.
autoMigrate: true
# -- Seconds the startup will spend retrying the initial Postgres
# connection before the pod aborts. Default 600s matches the chart's
# hard-coded startupProbe budget (failureThreshold 60 x periodSeconds
# 10). Going above 600s requires overriding the probe in your own
# downstream template.
connectTimeoutSeconds: 600
# -- Bundled Postgres instance for development and evaluation only.
# Not suitable for production. Deployed when enabled is true and url /
# urlFile are not set.
Expand Down
53 changes: 45 additions & 8 deletions src/agentevals/storage/postgres/migrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

import asyncio
import logging
import math
import os
import re
from dataclasses import dataclass
from importlib.resources import files
Expand Down Expand Up @@ -252,23 +254,58 @@ def discover_migrations() -> list[Migration]:
return _discover_migrations()


CONNECT_RETRY_DEADLINE_S = 60.0
"""Total wall-clock budget for the initial Postgres connection. Bundled PG
in Kubernetes typically takes 5-15s to be ready (PVC bind, initdb, listener
bind), so the agentevals lifespan can race the database on a fresh deploy.
Retrying tolerates that gap rather than failing pod startup and relying on
CrashLoopBackOff timing to eventually line up."""
CONNECT_RETRY_DEADLINE_S = 600.0
"""Default total wall-clock budget for the initial Postgres connection.
Sized to span Kubernetes bring-up of a freshly provisioned database (PVC
bind, initdb, listener bind, network policy propagation). Override at
runtime by setting ``AGENTEVALS_DB_CONNECT_TIMEOUT_S`` to a positive
number of seconds; an invalid value logs a warning and falls back to this
default."""


def connect_deadline_seconds() -> float:
"""Resolve the connect-retry budget. Reads ``AGENTEVALS_DB_CONNECT_TIMEOUT_S``
and falls back to :data:`CONNECT_RETRY_DEADLINE_S` if the env var is
unset, empty, non-numeric, non-finite, or non-positive."""
raw = os.getenv("AGENTEVALS_DB_CONNECT_TIMEOUT_S")
if raw is None or raw == "":
return CONNECT_RETRY_DEADLINE_S
try:
val = float(raw)
except ValueError:
logger.warning(
"Invalid AGENTEVALS_DB_CONNECT_TIMEOUT_S=%r (not a number); using default %.0fs",
raw,
CONNECT_RETRY_DEADLINE_S,
)
return CONNECT_RETRY_DEADLINE_S
if not math.isfinite(val):
logger.warning(
"Invalid AGENTEVALS_DB_CONNECT_TIMEOUT_S=%r (must be finite); using default %.0fs",
raw,
CONNECT_RETRY_DEADLINE_S,
)
return CONNECT_RETRY_DEADLINE_S
if val <= 0:
logger.warning(
"Invalid AGENTEVALS_DB_CONNECT_TIMEOUT_S=%r (must be positive); using default %.0fs",
raw,
CONNECT_RETRY_DEADLINE_S,
)
return CONNECT_RETRY_DEADLINE_S
return val
Comment thread
krisztianfekete marked this conversation as resolved.


async def connect_with_retry(dsn: str, asyncpg_module) -> "asyncpg.Connection":
"""Open a single asyncpg connection, retrying on connection-refused or
server-not-ready errors for up to ``CONNECT_RETRY_DEADLINE_S`` seconds.
server-not-ready errors for up to :func:`connect_deadline_seconds`
seconds.

Connection-time errors are tolerated; once a connection has been
established and a query returned, all subsequent failures propagate
normally.
"""
deadline = asyncio.get_event_loop().time() + CONNECT_RETRY_DEADLINE_S
deadline = asyncio.get_event_loop().time() + connect_deadline_seconds()
delay = 0.5
while True:
try:
Expand Down
4 changes: 2 additions & 2 deletions src/agentevals/storage/postgres/pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ async def create_pool(settings: StorageSettings) -> "asyncpg.Pool":
settings.schema_name,
)

from .migrator import CONNECT_RETRY_DEADLINE_S
from .migrator import connect_deadline_seconds

deadline = asyncio.get_event_loop().time() + CONNECT_RETRY_DEADLINE_S
deadline = asyncio.get_event_loop().time() + connect_deadline_seconds()
delay = 0.5
while True:
try:
Expand Down
58 changes: 58 additions & 0 deletions tests/storage/test_migrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,20 @@

from __future__ import annotations

import logging
import os
import re

import pytest

from agentevals.storage.postgres.migrator import (
ADVISORY_LOCK_KEY,
CONNECT_RETRY_DEADLINE_S,
Migration,
Migrator,
_apply_schema,
_discover_migrations,
connect_deadline_seconds,
discover_migrations,
)

Expand Down Expand Up @@ -71,6 +74,61 @@ def test_stable(self):
assert ADVISORY_LOCK_KEY == 7259820376655812345


class TestConnectDeadlineSeconds:
"""``connect_deadline_seconds`` resolves AGENTEVALS_DB_CONNECT_TIMEOUT_S
to a float, falling back to CONNECT_RETRY_DEADLINE_S on any input the
retry loop cannot consume. Each failure mode logs at WARNING so the
cause is diagnosable from pod logs."""

@pytest.fixture(autouse=True)
def _clean_env(self, monkeypatch):
monkeypatch.delenv("AGENTEVALS_DB_CONNECT_TIMEOUT_S", raising=False)

def test_unset_returns_default(self):
assert connect_deadline_seconds() == CONNECT_RETRY_DEADLINE_S

def test_empty_returns_default(self, monkeypatch):
monkeypatch.setenv("AGENTEVALS_DB_CONNECT_TIMEOUT_S", "")
assert connect_deadline_seconds() == CONNECT_RETRY_DEADLINE_S

@pytest.mark.parametrize(
("raw", "expected"),
[
("42", 42.0),
("120.5", 120.5),
("0.1", 0.1),
("3600", 3600.0),
],
)
def test_parses_valid_positive_values(self, monkeypatch, raw, expected):
monkeypatch.setenv("AGENTEVALS_DB_CONNECT_TIMEOUT_S", raw)
assert connect_deadline_seconds() == expected

@pytest.mark.parametrize(
("raw", "reason_substring"),
[
("foo", "not a number"),
("nan", "must be finite"),
("inf", "must be finite"),
("-inf", "must be finite"),
("0", "must be positive"),
("-5", "must be positive"),
],
)
def test_invalid_values_fall_back_with_warning(self, monkeypatch, caplog, raw, reason_substring):
"""Bad inputs return the default and log exactly one warning that
names the specific validation branch. The cardinality check
guards against a refactor that double-logs (e.g. emits both a
generic and a specific message)."""
monkeypatch.setenv("AGENTEVALS_DB_CONNECT_TIMEOUT_S", raw)
with caplog.at_level(logging.WARNING, logger="agentevals.storage.postgres.migrator"):
result = connect_deadline_seconds()
assert result == CONNECT_RETRY_DEADLINE_S
warnings = [r.getMessage() for r in caplog.records if r.levelname == "WARNING"]
assert len(warnings) == 1, f"expected one warning, got {warnings}"
assert reason_substring in warnings[0]


class TestMigrationFilePattern:
def test_filename_format(self):
migrations = _discover_migrations()
Expand Down