Skip to content

Commit

Permalink
Initial implementation of ARI (#286)
Browse files Browse the repository at this point in the history
* Initial implementation of ARI

* Enhance redundancy, robustness, and logging

* Improve ARI updating; integrate on-demand TLS; detect changed window
  • Loading branch information
mholt committed May 7, 2024
1 parent fa7161a commit 0e88b3e
Show file tree
Hide file tree
Showing 11 changed files with 528 additions and 100 deletions.
77 changes: 45 additions & 32 deletions acmeclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,44 +137,21 @@ func (iss *ACMEIssuer) newACMEClientWithAccount(ctx context.Context, useTestCA,
// independent of any particular ACME account. If useTestCA is true, am.TestCA
// will be used if it is set; otherwise, the primary CA will be used.
func (iss *ACMEIssuer) newACMEClient(useTestCA bool) (*acmez.Client, error) {
// ensure defaults are filled in
var caURL string
if useTestCA {
caURL = iss.TestCA
}
if caURL == "" {
caURL = iss.CA
client, err := iss.newBasicACMEClient()
if err != nil {
return nil, err
}
if caURL == "" {
caURL = DefaultACME.CA

// fill in a little more beyond a basic client
if useTestCA && iss.TestCA != "" {
client.Client.Directory = iss.TestCA
}
certObtainTimeout := iss.CertObtainTimeout
if certObtainTimeout == 0 {
certObtainTimeout = DefaultACME.CertObtainTimeout
}

// ensure endpoint is secure (assume HTTPS if scheme is missing)
if !strings.Contains(caURL, "://") {
caURL = "https://" + caURL
}
u, err := url.Parse(caURL)
if err != nil {
return nil, err
}
if u.Scheme != "https" && !SubjectIsInternal(u.Host) {
return nil, fmt.Errorf("%s: insecure CA URL (HTTPS required for non-internal CA)", caURL)
}

client := &acmez.Client{
Client: &acme.Client{
Directory: caURL,
PollTimeout: certObtainTimeout,
UserAgent: buildUAString(),
HTTPClient: iss.httpClient,
},
ChallengeSolvers: make(map[string]acmez.Solver),
}
client.Logger = iss.Logger.Named("acme_client")
client.Client.PollTimeout = certObtainTimeout
client.ChallengeSolvers = make(map[string]acmez.Solver)

// configure challenges (most of the time, DNS challenge is
// exclusive of other ones because it is usually only used
Expand Down Expand Up @@ -230,6 +207,42 @@ func (iss *ACMEIssuer) newACMEClient(useTestCA bool) (*acmez.Client, error) {
return client, nil
}

// newBasicACMEClient sets up a basically-functional ACME client that is not capable
// of solving challenges but can provide basic interactions with the server.
func (iss *ACMEIssuer) newBasicACMEClient() (*acmez.Client, error) {
caURL := iss.CA
if caURL == "" {
caURL = DefaultACME.CA
}
// ensure endpoint is secure (assume HTTPS if scheme is missing)
if !strings.Contains(caURL, "://") {
caURL = "https://" + caURL
}
u, err := url.Parse(caURL)
if err != nil {
return nil, err
}
if u.Scheme != "https" && !SubjectIsInternal(u.Host) {
return nil, fmt.Errorf("%s: insecure CA URL (HTTPS required for non-internal CA)", caURL)
}
return &acmez.Client{
Client: &acme.Client{
Directory: caURL,
UserAgent: buildUAString(),
HTTPClient: iss.httpClient,
Logger: iss.Logger.Named("acme_client"),
},
}, nil
}

func (iss *ACMEIssuer) getRenewalInfo(ctx context.Context, cert Certificate) (acme.RenewalInfo, error) {
acmeClient, err := iss.newBasicACMEClient()
if err != nil {
return acme.RenewalInfo{}, err
}
return acmeClient.GetRenewalInfo(ctx, cert.Certificate.Leaf)
}

func (iss *ACMEIssuer) getHTTPPort() int {
useHTTPPort := HTTPChallengePort
if HTTPPort > 0 && HTTPPort != HTTPChallengePort {
Expand Down
34 changes: 28 additions & 6 deletions acmeissuer.go
Original file line number Diff line number Diff line change
Expand Up @@ -362,12 +362,13 @@ func (am *ACMEIssuer) Issue(ctx context.Context, csr *x509.CertificateRequest) (
panic("missing config pointer (must use NewACMEIssuer)")
}

var isRetry bool
if attempts, ok := ctx.Value(AttemptsCtxKey).(*int); ok {
isRetry = *attempts > 0
var attempts int
if attemptsPtr, ok := ctx.Value(AttemptsCtxKey).(*int); ok {
attempts = *attemptsPtr
}
isRetry := attempts > 0

cert, usedTestCA, err := am.doIssue(ctx, csr, isRetry)
cert, usedTestCA, err := am.doIssue(ctx, csr, attempts)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -395,7 +396,7 @@ func (am *ACMEIssuer) Issue(ctx context.Context, csr *x509.CertificateRequest) (
// other endpoint. This is more likely to happen if a user is testing with
// the staging CA as the main CA, then changes their configuration once they
// think they are ready for the production endpoint.
cert, _, err = am.doIssue(ctx, csr, false)
cert, _, err = am.doIssue(ctx, csr, 0)
if err != nil {
// succeeded with test CA but failed just now with the production CA;
// either we are observing differing internal states of each CA that will
Expand Down Expand Up @@ -423,7 +424,8 @@ func (am *ACMEIssuer) Issue(ctx context.Context, csr *x509.CertificateRequest) (
return cert, err
}

func (am *ACMEIssuer) doIssue(ctx context.Context, csr *x509.CertificateRequest, useTestCA bool) (*IssuedCertificate, bool, error) {
func (am *ACMEIssuer) doIssue(ctx context.Context, csr *x509.CertificateRequest, attempts int) (*IssuedCertificate, bool, error) {
useTestCA := attempts > 0
client, err := am.newACMEClientWithAccount(ctx, useTestCA, false)
if err != nil {
return nil, false, err
Expand All @@ -449,6 +451,22 @@ func (am *ACMEIssuer) doIssue(ctx context.Context, csr *x509.CertificateRequest,
params.NotAfter = time.Now().Add(am.NotAfter)
}

// Notify the ACME server we are replacing a certificate (if the caller says we are),
// only if the following conditions are met:
// - The caller has set a Replaces value in the context, indicating this is a renewal.
// - Not using test CA. This should be obvious, but a test CA should be in a separate
// environment from production, and thus not have knowledge of the cert being replaced.
// - Not a certain attempt number. We skip setting Replaces once early on in the retries
// in case the reason the order is failing is only because there is a state inconsistency
// between client and server or some sort of bookkeeping error with regards to the certID
// and the server is rejecting the ARI certID. In any case, an invalid certID may cause
// orders to fail. So try once without setting it.
if !usingTestCA && attempts != 2 {
if replacing, ok := ctx.Value(ctxKeyARIReplaces).(*x509.Certificate); ok {
params.Replaces = replacing
}
}

// do this in a loop because there's an error case that may necessitate a retry, but not more than once
var certChains []acme.Certificate
for i := 0; i < 2; i++ {
Expand Down Expand Up @@ -631,6 +649,10 @@ const (
// prefixACME is the storage key prefix used for ACME-specific assets.
const prefixACME = "acme"

type ctxKey string

const ctxKeyARIReplaces = ctxKey("ari_replaces")

// Interface guards
var (
_ PreChecker = (*ACMEIssuer)(nil)
Expand Down
159 changes: 146 additions & 13 deletions certificates.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,15 @@ import (
"context"
"crypto/tls"
"crypto/x509"
"encoding/json"
"fmt"
"math/rand"
"net"
"os"
"strings"
"time"

"github.com/mholt/acmez/v2/acme"
"go.uber.org/zap"
"golang.org/x/crypto/ocsp"
)
Expand Down Expand Up @@ -56,6 +59,9 @@ type Certificate struct {

// The unique string identifying the issuer of this certificate.
issuerKey string

// ACME Renewal Information, if available
ari acme.RenewalInfo
}

// Empty returns true if the certificate struct is not filled out; at
Expand All @@ -67,10 +73,106 @@ func (cert Certificate) Empty() bool {
// Hash returns a checksum of the certificate chain's DER-encoded bytes.
func (cert Certificate) Hash() string { return cert.hash }

// NeedsRenewal returns true if the certificate is
// expiring soon (according to cfg) or has expired.
// NeedsRenewal returns true if the certificate is expiring
// soon (according to ARI and/or cfg) or has expired.
func (cert Certificate) NeedsRenewal(cfg *Config) bool {
return currentlyInRenewalWindow(cert.Leaf.NotBefore, expiresAt(cert.Leaf), cfg.RenewalWindowRatio)
return cfg.certNeedsRenewal(cert.Leaf, cert.ari, true)
}

// certNeedsRenewal consults ACME Renewal Info (ARI) and certificate expiration to determine
// whether the leaf certificate needs to be renewed yet. If true is returned, the certificate
// should be renewed as soon as possible. The reasoning for a true return value is logged
// unless emitLogs is false; this can be useful to suppress noisy logs in the case where you
// first call this to determine if a cert in memory needs renewal, and then right after you
// call it again to see if the cert in storage still needs renewal -- you probably don't want
// to log the second time for checking the cert in storage which is mainly for synchronization.
func (cfg *Config) certNeedsRenewal(leaf *x509.Certificate, ari acme.RenewalInfo, emitLogs bool) bool {
expiration := expiresAt(leaf)

var logger *zap.Logger
if emitLogs {
logger = cfg.Logger.With(
zap.Strings("subjects", leaf.DNSNames),
zap.Time("expiration", expiration),
zap.String("ari_cert_id", ari.UniqueIdentifier),
zap.Timep("next_ari_update", ari.RetryAfter),
zap.Duration("renew_check_interval", cfg.certCache.options.RenewCheckInterval),
zap.Time("window_start", ari.SuggestedWindow.Start),
zap.Time("window_end", ari.SuggestedWindow.End))
} else {
logger = zap.NewNop()
}

// first check ARI: if it says it's time to renew, it's time to renew
// (notice that we don't strictly require an ARI window to also exist; we presume
// that if a time has been selected, a window does or did exist, even if it didn't
// get stored/encoded for some reason - but also: this allows administrators to
// manually or explicitly schedule a renewal time indepedently of ARI which could
// be useful)
selectedTime := ari.SelectedTime

// if, for some reason a random time in the window hasn't been selected yet, but an ARI
// window does exist, we can always improvise one... even if this is called repeatedly,
// a random time is a random time, whether you generate it once or more :D
// (code borrowed from our acme package)
if selectedTime.IsZero() &&
(!ari.SuggestedWindow.Start.IsZero() && !ari.SuggestedWindow.End.IsZero()) {
start, end := ari.SuggestedWindow.Start.Unix()+1, ari.SuggestedWindow.End.Unix()
selectedTime = time.Unix(rand.Int63n(end-start)+start, 0).UTC()
logger.Warn("no renewal time had been selected with ARI; chose an ephemeral one for now",
zap.Time("ephemeral_selected_time", selectedTime))
}

// if a renewal time has been selected, start with that
if !selectedTime.IsZero() {
// ARI spec recommends an algorithm that renews after the randomly-selected
// time OR just before it if the next waking time would be after it; this
// cutoff can actually be before the start of the renewal window, but the spec
// author says that's OK: https://github.com/aarongable/draft-acme-ari/issues/71
cutoff := ari.SelectedTime.Add(-cfg.certCache.options.RenewCheckInterval)
if time.Now().After(cutoff) {
logger.Info("certificate needs renewal based on ARI window",
zap.Time("selected_time", selectedTime),
zap.Time("renewal_cutoff", cutoff))
return true
}

// according to ARI, we are not ready to renew; however, we do not rely solely on
// ARI calculations... what if there is a bug in our implementation, or in the
// server's, or the stored metadata? for redundancy, give credence to the expiration
// date; ignore ARI if we are past a "dangerously close" limit, to avoid any
// possibility of a bug in ARI compromising a site's uptime: we should always always
// always give heed to actual validity period
if currentlyInRenewalWindow(leaf.NotBefore, expiration, 1.0/20.0) {
logger.Warn("certificate is in emergency renewal window; superceding ARI",
zap.Duration("remaining", time.Until(expiration)),
zap.Time("renewal_cutoff", cutoff))
return true
}

}

// the normal check, in the absence of ARI, is to determine if we're near enough (or past)
// the expiration date based on the configured remaining:lifetime ratio
if currentlyInRenewalWindow(leaf.NotBefore, expiration, cfg.RenewalWindowRatio) {
logger.Info("certificate is in configured renewal window based on expiration date",
zap.Duration("remaining", time.Until(expiration)))
return true
}

// finally, if the certificate is expiring imminently, always attempt a renewal;
// we check both a (very low) lifetime ratio and also a strict difference between
// the time until expiration and the interval at which we run the standard maintenance
// routine to check for renewals, to accommodate both exceptionally long and short
// cert lifetimes
if currentlyInRenewalWindow(leaf.NotBefore, expiration, 1.0/50.0) ||
time.Until(expiration) < cfg.certCache.options.RenewCheckInterval*5 {
logger.Warn("certificate is in emergency renewal window; expiration imminent",
zap.Duration("remaining", time.Until(expiration)))
return true
}

return false
}

// Expired returns true if the certificate has expired.
Expand All @@ -85,10 +187,12 @@ func (cert Certificate) Expired() bool {
return time.Now().After(expiresAt(cert.Leaf))
}

// currentlyInRenewalWindow returns true if the current time is
// within the renewal window, according to the given start/end
// currentlyInRenewalWindow returns true if the current time is within
// (or after) the renewal window, according to the given start/end
// dates and the ratio of the renewal window. If true is returned,
// the certificate being considered is due for renewal.
// the certificate being considered is due for renewal. The ratio
// is remaining:total time, i.e. 1/3 = 1/3 of lifetime remaining,
// or 9/10 = 9/10 of time lifetime remaining.
func currentlyInRenewalWindow(notBefore, notAfter time.Time, renewalWindowRatio float64) bool {
if notAfter.IsZero() {
return false
Expand Down Expand Up @@ -154,9 +258,37 @@ func (cfg *Config) loadManagedCertificate(ctx context.Context, domain string) (C
}
cert.managed = true
cert.issuerKey = certRes.issuerKey
if ari, err := certRes.getARI(); err == nil && ari != nil {
cert.ari = *ari
}
return cert, nil
}

// getARI unpacks ACME Renewal Information from the issuer data, if available.
// It is only an error if there is invalid JSON.
func (certRes CertificateResource) getARI() (*acme.RenewalInfo, error) {
acmeData, err := certRes.getACMEData()
if err != nil {
return nil, err
}
return acmeData.RenewalInfo, nil
}

// getACMEData returns the ACME certificate metadata from the IssuerData, but
// note that a non-ACME-issued certificate may return an empty value and nil
// since the JSON may still decode successfully but just not match any or all
// of the fields. Remember that the IssuerKey is used to store and access the
// cert files in the first place (it is part of the path) so in theory if you
// load a CertificateResource from an ACME issuer it should work as expected.
func (certRes CertificateResource) getACMEData() (acme.Certificate, error) {
if len(certRes.IssuerData) == 0 {
return acme.Certificate{}, nil
}
var acmeCert acme.Certificate
err := json.Unmarshal(certRes.IssuerData, &acmeCert)
return acmeCert, err
}

// CacheUnmanagedCertificatePEMFile loads a certificate for host using certFile
// and keyFile, which must be in PEM format. It stores the certificate in
// the in-memory cache and returns the hash, useful for removing from the cache.
Expand Down Expand Up @@ -329,21 +461,22 @@ func fillCertFromLeaf(cert *Certificate, tlsCert tls.Certificate) error {
return nil
}

// managedCertInStorageExpiresSoon returns true if cert (being a
// managed certificate) is expiring within RenewDurationBefore.
// It returns false if there was an error checking the expiration
// of the certificate as found in storage, or if the certificate
// in storage is NOT expiring soon. A certificate that is expiring
// managedCertInStorageNeedsRenewal returns true if cert (being a
// managed certificate) is expiring soon (according to cfg) or if
// ACME Renewal Information (ARI) is available and says that it is
// time to renew (it uses existing ARI; it does not update it).
// It returns false if there was an error, the cert is not expiring
// soon, and ARI window is still future. A certificate that is expiring
// soon in our cache but is not expiring soon in storage probably
// means that another instance renewed the certificate in the
// meantime, and it would be a good idea to simply load the cert
// into our cache rather than repeating the renewal process again.
func (cfg *Config) managedCertInStorageExpiresSoon(ctx context.Context, cert Certificate) (bool, error) {
func (cfg *Config) managedCertInStorageNeedsRenewal(ctx context.Context, cert Certificate) (bool, error) {
certRes, err := cfg.loadCertResourceAnyIssuer(ctx, cert.Names[0])
if err != nil {
return false, err
}
_, needsRenew := cfg.managedCertNeedsRenewal(certRes)
_, _, needsRenew := cfg.managedCertNeedsRenewal(certRes, false)
return needsRenew, nil
}

Expand Down

0 comments on commit 0e88b3e

Please sign in to comment.