From 0e88b3eaa1afb128a287df85a4ffdc0b60b55940 Mon Sep 17 00:00:00 2001 From: Matt Holt Date: Tue, 7 May 2024 09:46:03 -0600 Subject: [PATCH] Initial implementation of ARI (#286) * Initial implementation of ARI * Enhance redundancy, robustness, and logging * Improve ARI updating; integrate on-demand TLS; detect changed window --- acmeclient.go | 77 ++++++++++------- acmeissuer.go | 34 ++++++-- certificates.go | 159 +++++++++++++++++++++++++++++++--- certmagic.go | 6 +- config.go | 51 +++++++++-- config_test.go | 25 ++++-- go.mod | 2 +- go.sum | 4 +- handshake.go | 43 +++++---- maintain.go | 221 +++++++++++++++++++++++++++++++++++++++++++++-- zerosslissuer.go | 6 +- 11 files changed, 528 insertions(+), 100 deletions(-) diff --git a/acmeclient.go b/acmeclient.go index 1727f1eb..8d7888f2 100644 --- a/acmeclient.go +++ b/acmeclient.go @@ -137,44 +137,21 @@ func (iss *ACMEIssuer) newACMEClientWithAccount(ctx context.Context, useTestCA, // independent of any particular ACME account. If useTestCA is true, am.TestCA // will be used if it is set; otherwise, the primary CA will be used. func (iss *ACMEIssuer) newACMEClient(useTestCA bool) (*acmez.Client, error) { - // ensure defaults are filled in - var caURL string - if useTestCA { - caURL = iss.TestCA - } - if caURL == "" { - caURL = iss.CA + client, err := iss.newBasicACMEClient() + if err != nil { + return nil, err } - if caURL == "" { - caURL = DefaultACME.CA + + // fill in a little more beyond a basic client + if useTestCA && iss.TestCA != "" { + client.Client.Directory = iss.TestCA } certObtainTimeout := iss.CertObtainTimeout if certObtainTimeout == 0 { certObtainTimeout = DefaultACME.CertObtainTimeout } - - // ensure endpoint is secure (assume HTTPS if scheme is missing) - if !strings.Contains(caURL, "://") { - caURL = "https://" + caURL - } - u, err := url.Parse(caURL) - if err != nil { - return nil, err - } - if u.Scheme != "https" && !SubjectIsInternal(u.Host) { - return nil, fmt.Errorf("%s: insecure CA URL (HTTPS required for non-internal CA)", caURL) - } - - client := &acmez.Client{ - Client: &acme.Client{ - Directory: caURL, - PollTimeout: certObtainTimeout, - UserAgent: buildUAString(), - HTTPClient: iss.httpClient, - }, - ChallengeSolvers: make(map[string]acmez.Solver), - } - client.Logger = iss.Logger.Named("acme_client") + client.Client.PollTimeout = certObtainTimeout + client.ChallengeSolvers = make(map[string]acmez.Solver) // configure challenges (most of the time, DNS challenge is // exclusive of other ones because it is usually only used @@ -230,6 +207,42 @@ func (iss *ACMEIssuer) newACMEClient(useTestCA bool) (*acmez.Client, error) { return client, nil } +// newBasicACMEClient sets up a basically-functional ACME client that is not capable +// of solving challenges but can provide basic interactions with the server. +func (iss *ACMEIssuer) newBasicACMEClient() (*acmez.Client, error) { + caURL := iss.CA + if caURL == "" { + caURL = DefaultACME.CA + } + // ensure endpoint is secure (assume HTTPS if scheme is missing) + if !strings.Contains(caURL, "://") { + caURL = "https://" + caURL + } + u, err := url.Parse(caURL) + if err != nil { + return nil, err + } + if u.Scheme != "https" && !SubjectIsInternal(u.Host) { + return nil, fmt.Errorf("%s: insecure CA URL (HTTPS required for non-internal CA)", caURL) + } + return &acmez.Client{ + Client: &acme.Client{ + Directory: caURL, + UserAgent: buildUAString(), + HTTPClient: iss.httpClient, + Logger: iss.Logger.Named("acme_client"), + }, + }, nil +} + +func (iss *ACMEIssuer) getRenewalInfo(ctx context.Context, cert Certificate) (acme.RenewalInfo, error) { + acmeClient, err := iss.newBasicACMEClient() + if err != nil { + return acme.RenewalInfo{}, err + } + return acmeClient.GetRenewalInfo(ctx, cert.Certificate.Leaf) +} + func (iss *ACMEIssuer) getHTTPPort() int { useHTTPPort := HTTPChallengePort if HTTPPort > 0 && HTTPPort != HTTPChallengePort { diff --git a/acmeissuer.go b/acmeissuer.go index fa537e11..87fa5ff5 100644 --- a/acmeissuer.go +++ b/acmeissuer.go @@ -362,12 +362,13 @@ func (am *ACMEIssuer) Issue(ctx context.Context, csr *x509.CertificateRequest) ( panic("missing config pointer (must use NewACMEIssuer)") } - var isRetry bool - if attempts, ok := ctx.Value(AttemptsCtxKey).(*int); ok { - isRetry = *attempts > 0 + var attempts int + if attemptsPtr, ok := ctx.Value(AttemptsCtxKey).(*int); ok { + attempts = *attemptsPtr } + isRetry := attempts > 0 - cert, usedTestCA, err := am.doIssue(ctx, csr, isRetry) + cert, usedTestCA, err := am.doIssue(ctx, csr, attempts) if err != nil { return nil, err } @@ -395,7 +396,7 @@ func (am *ACMEIssuer) Issue(ctx context.Context, csr *x509.CertificateRequest) ( // other endpoint. This is more likely to happen if a user is testing with // the staging CA as the main CA, then changes their configuration once they // think they are ready for the production endpoint. - cert, _, err = am.doIssue(ctx, csr, false) + cert, _, err = am.doIssue(ctx, csr, 0) if err != nil { // succeeded with test CA but failed just now with the production CA; // either we are observing differing internal states of each CA that will @@ -423,7 +424,8 @@ func (am *ACMEIssuer) Issue(ctx context.Context, csr *x509.CertificateRequest) ( return cert, err } -func (am *ACMEIssuer) doIssue(ctx context.Context, csr *x509.CertificateRequest, useTestCA bool) (*IssuedCertificate, bool, error) { +func (am *ACMEIssuer) doIssue(ctx context.Context, csr *x509.CertificateRequest, attempts int) (*IssuedCertificate, bool, error) { + useTestCA := attempts > 0 client, err := am.newACMEClientWithAccount(ctx, useTestCA, false) if err != nil { return nil, false, err @@ -449,6 +451,22 @@ func (am *ACMEIssuer) doIssue(ctx context.Context, csr *x509.CertificateRequest, params.NotAfter = time.Now().Add(am.NotAfter) } + // Notify the ACME server we are replacing a certificate (if the caller says we are), + // only if the following conditions are met: + // - The caller has set a Replaces value in the context, indicating this is a renewal. + // - Not using test CA. This should be obvious, but a test CA should be in a separate + // environment from production, and thus not have knowledge of the cert being replaced. + // - Not a certain attempt number. We skip setting Replaces once early on in the retries + // in case the reason the order is failing is only because there is a state inconsistency + // between client and server or some sort of bookkeeping error with regards to the certID + // and the server is rejecting the ARI certID. In any case, an invalid certID may cause + // orders to fail. So try once without setting it. + if !usingTestCA && attempts != 2 { + if replacing, ok := ctx.Value(ctxKeyARIReplaces).(*x509.Certificate); ok { + params.Replaces = replacing + } + } + // do this in a loop because there's an error case that may necessitate a retry, but not more than once var certChains []acme.Certificate for i := 0; i < 2; i++ { @@ -631,6 +649,10 @@ const ( // prefixACME is the storage key prefix used for ACME-specific assets. const prefixACME = "acme" +type ctxKey string + +const ctxKeyARIReplaces = ctxKey("ari_replaces") + // Interface guards var ( _ PreChecker = (*ACMEIssuer)(nil) diff --git a/certificates.go b/certificates.go index 8540f233..a5147a2d 100644 --- a/certificates.go +++ b/certificates.go @@ -18,12 +18,15 @@ import ( "context" "crypto/tls" "crypto/x509" + "encoding/json" "fmt" + "math/rand" "net" "os" "strings" "time" + "github.com/mholt/acmez/v2/acme" "go.uber.org/zap" "golang.org/x/crypto/ocsp" ) @@ -56,6 +59,9 @@ type Certificate struct { // The unique string identifying the issuer of this certificate. issuerKey string + + // ACME Renewal Information, if available + ari acme.RenewalInfo } // Empty returns true if the certificate struct is not filled out; at @@ -67,10 +73,106 @@ func (cert Certificate) Empty() bool { // Hash returns a checksum of the certificate chain's DER-encoded bytes. func (cert Certificate) Hash() string { return cert.hash } -// NeedsRenewal returns true if the certificate is -// expiring soon (according to cfg) or has expired. +// NeedsRenewal returns true if the certificate is expiring +// soon (according to ARI and/or cfg) or has expired. func (cert Certificate) NeedsRenewal(cfg *Config) bool { - return currentlyInRenewalWindow(cert.Leaf.NotBefore, expiresAt(cert.Leaf), cfg.RenewalWindowRatio) + return cfg.certNeedsRenewal(cert.Leaf, cert.ari, true) +} + +// certNeedsRenewal consults ACME Renewal Info (ARI) and certificate expiration to determine +// whether the leaf certificate needs to be renewed yet. If true is returned, the certificate +// should be renewed as soon as possible. The reasoning for a true return value is logged +// unless emitLogs is false; this can be useful to suppress noisy logs in the case where you +// first call this to determine if a cert in memory needs renewal, and then right after you +// call it again to see if the cert in storage still needs renewal -- you probably don't want +// to log the second time for checking the cert in storage which is mainly for synchronization. +func (cfg *Config) certNeedsRenewal(leaf *x509.Certificate, ari acme.RenewalInfo, emitLogs bool) bool { + expiration := expiresAt(leaf) + + var logger *zap.Logger + if emitLogs { + logger = cfg.Logger.With( + zap.Strings("subjects", leaf.DNSNames), + zap.Time("expiration", expiration), + zap.String("ari_cert_id", ari.UniqueIdentifier), + zap.Timep("next_ari_update", ari.RetryAfter), + zap.Duration("renew_check_interval", cfg.certCache.options.RenewCheckInterval), + zap.Time("window_start", ari.SuggestedWindow.Start), + zap.Time("window_end", ari.SuggestedWindow.End)) + } else { + logger = zap.NewNop() + } + + // first check ARI: if it says it's time to renew, it's time to renew + // (notice that we don't strictly require an ARI window to also exist; we presume + // that if a time has been selected, a window does or did exist, even if it didn't + // get stored/encoded for some reason - but also: this allows administrators to + // manually or explicitly schedule a renewal time indepedently of ARI which could + // be useful) + selectedTime := ari.SelectedTime + + // if, for some reason a random time in the window hasn't been selected yet, but an ARI + // window does exist, we can always improvise one... even if this is called repeatedly, + // a random time is a random time, whether you generate it once or more :D + // (code borrowed from our acme package) + if selectedTime.IsZero() && + (!ari.SuggestedWindow.Start.IsZero() && !ari.SuggestedWindow.End.IsZero()) { + start, end := ari.SuggestedWindow.Start.Unix()+1, ari.SuggestedWindow.End.Unix() + selectedTime = time.Unix(rand.Int63n(end-start)+start, 0).UTC() + logger.Warn("no renewal time had been selected with ARI; chose an ephemeral one for now", + zap.Time("ephemeral_selected_time", selectedTime)) + } + + // if a renewal time has been selected, start with that + if !selectedTime.IsZero() { + // ARI spec recommends an algorithm that renews after the randomly-selected + // time OR just before it if the next waking time would be after it; this + // cutoff can actually be before the start of the renewal window, but the spec + // author says that's OK: https://github.com/aarongable/draft-acme-ari/issues/71 + cutoff := ari.SelectedTime.Add(-cfg.certCache.options.RenewCheckInterval) + if time.Now().After(cutoff) { + logger.Info("certificate needs renewal based on ARI window", + zap.Time("selected_time", selectedTime), + zap.Time("renewal_cutoff", cutoff)) + return true + } + + // according to ARI, we are not ready to renew; however, we do not rely solely on + // ARI calculations... what if there is a bug in our implementation, or in the + // server's, or the stored metadata? for redundancy, give credence to the expiration + // date; ignore ARI if we are past a "dangerously close" limit, to avoid any + // possibility of a bug in ARI compromising a site's uptime: we should always always + // always give heed to actual validity period + if currentlyInRenewalWindow(leaf.NotBefore, expiration, 1.0/20.0) { + logger.Warn("certificate is in emergency renewal window; superceding ARI", + zap.Duration("remaining", time.Until(expiration)), + zap.Time("renewal_cutoff", cutoff)) + return true + } + + } + + // the normal check, in the absence of ARI, is to determine if we're near enough (or past) + // the expiration date based on the configured remaining:lifetime ratio + if currentlyInRenewalWindow(leaf.NotBefore, expiration, cfg.RenewalWindowRatio) { + logger.Info("certificate is in configured renewal window based on expiration date", + zap.Duration("remaining", time.Until(expiration))) + return true + } + + // finally, if the certificate is expiring imminently, always attempt a renewal; + // we check both a (very low) lifetime ratio and also a strict difference between + // the time until expiration and the interval at which we run the standard maintenance + // routine to check for renewals, to accommodate both exceptionally long and short + // cert lifetimes + if currentlyInRenewalWindow(leaf.NotBefore, expiration, 1.0/50.0) || + time.Until(expiration) < cfg.certCache.options.RenewCheckInterval*5 { + logger.Warn("certificate is in emergency renewal window; expiration imminent", + zap.Duration("remaining", time.Until(expiration))) + return true + } + + return false } // Expired returns true if the certificate has expired. @@ -85,10 +187,12 @@ func (cert Certificate) Expired() bool { return time.Now().After(expiresAt(cert.Leaf)) } -// currentlyInRenewalWindow returns true if the current time is -// within the renewal window, according to the given start/end +// currentlyInRenewalWindow returns true if the current time is within +// (or after) the renewal window, according to the given start/end // dates and the ratio of the renewal window. If true is returned, -// the certificate being considered is due for renewal. +// the certificate being considered is due for renewal. The ratio +// is remaining:total time, i.e. 1/3 = 1/3 of lifetime remaining, +// or 9/10 = 9/10 of time lifetime remaining. func currentlyInRenewalWindow(notBefore, notAfter time.Time, renewalWindowRatio float64) bool { if notAfter.IsZero() { return false @@ -154,9 +258,37 @@ func (cfg *Config) loadManagedCertificate(ctx context.Context, domain string) (C } cert.managed = true cert.issuerKey = certRes.issuerKey + if ari, err := certRes.getARI(); err == nil && ari != nil { + cert.ari = *ari + } return cert, nil } +// getARI unpacks ACME Renewal Information from the issuer data, if available. +// It is only an error if there is invalid JSON. +func (certRes CertificateResource) getARI() (*acme.RenewalInfo, error) { + acmeData, err := certRes.getACMEData() + if err != nil { + return nil, err + } + return acmeData.RenewalInfo, nil +} + +// getACMEData returns the ACME certificate metadata from the IssuerData, but +// note that a non-ACME-issued certificate may return an empty value and nil +// since the JSON may still decode successfully but just not match any or all +// of the fields. Remember that the IssuerKey is used to store and access the +// cert files in the first place (it is part of the path) so in theory if you +// load a CertificateResource from an ACME issuer it should work as expected. +func (certRes CertificateResource) getACMEData() (acme.Certificate, error) { + if len(certRes.IssuerData) == 0 { + return acme.Certificate{}, nil + } + var acmeCert acme.Certificate + err := json.Unmarshal(certRes.IssuerData, &acmeCert) + return acmeCert, err +} + // CacheUnmanagedCertificatePEMFile loads a certificate for host using certFile // and keyFile, which must be in PEM format. It stores the certificate in // the in-memory cache and returns the hash, useful for removing from the cache. @@ -329,21 +461,22 @@ func fillCertFromLeaf(cert *Certificate, tlsCert tls.Certificate) error { return nil } -// managedCertInStorageExpiresSoon returns true if cert (being a -// managed certificate) is expiring within RenewDurationBefore. -// It returns false if there was an error checking the expiration -// of the certificate as found in storage, or if the certificate -// in storage is NOT expiring soon. A certificate that is expiring +// managedCertInStorageNeedsRenewal returns true if cert (being a +// managed certificate) is expiring soon (according to cfg) or if +// ACME Renewal Information (ARI) is available and says that it is +// time to renew (it uses existing ARI; it does not update it). +// It returns false if there was an error, the cert is not expiring +// soon, and ARI window is still future. A certificate that is expiring // soon in our cache but is not expiring soon in storage probably // means that another instance renewed the certificate in the // meantime, and it would be a good idea to simply load the cert // into our cache rather than repeating the renewal process again. -func (cfg *Config) managedCertInStorageExpiresSoon(ctx context.Context, cert Certificate) (bool, error) { +func (cfg *Config) managedCertInStorageNeedsRenewal(ctx context.Context, cert Certificate) (bool, error) { certRes, err := cfg.loadCertResourceAnyIssuer(ctx, cert.Names[0]) if err != nil { return false, err } - _, needsRenew := cfg.managedCertNeedsRenewal(certRes) + _, _, needsRenew := cfg.managedCertNeedsRenewal(certRes, false) return needsRenew, nil } diff --git a/certmagic.go b/certmagic.go index 719280d7..6b7be634 100644 --- a/certmagic.go +++ b/certmagic.go @@ -39,6 +39,7 @@ import ( "crypto" "crypto/tls" "crypto/x509" + "encoding/json" "fmt" "log" "net" @@ -388,7 +389,8 @@ type IssuedCertificate struct { Certificate []byte // Any extra information to serialize alongside the - // certificate in storage. + // certificate in storage. It MUST be serializable + // as JSON in order to be preserved. Metadata any } @@ -409,7 +411,7 @@ type CertificateResource struct { // Any extra information associated with the certificate, // usually provided by the issuer implementation. - IssuerData any `json:"issuer_data,omitempty"` + IssuerData json.RawMessage `json:"issuer_data,omitempty"` // The unique string identifying the issuer of the // certificate; internally useful for storage access. diff --git a/config.go b/config.go index d6ed61c2..5a9cf498 100644 --- a/config.go +++ b/config.go @@ -52,6 +52,7 @@ type Config struct { // it should be renewed; for most certificates, the // global default is good, but for extremely short- // lived certs, you may want to raise this to ~0.5. + // Ratio is remaining:total lifetime. RenewalWindowRatio float64 // An optional event callback clients can set @@ -446,6 +447,15 @@ func (cfg *Config) manageOne(ctx context.Context, domainName string, async bool) return err } + // ensure ARI is updated before we check whether the cert needs renewing + // (we ignore the second return value because we already check if needs renewing anyway) + if cert.ari.NeedsRefresh() { + cert, _, err = cfg.updateARI(ctx, cert, cfg.Logger) + if err != nil { + cfg.Logger.Error("updating ARI upon managing", zap.Error(err)) + } + } + // otherwise, simply renew the certificate if needed if cert.NeedsRenewal(cfg) { var err error @@ -639,11 +649,15 @@ func (cfg *Config) obtainCert(ctx context.Context, name string, interactive bool issuerKey := issuerUsed.IssuerKey() // success - immediately save the certificate resource + metaJSON, err := json.Marshal(issuedCert.Metadata) + if err != nil { + log.Error("unable to encode certificate metadata", zap.Error(err)) + } certRes := CertificateResource{ SANs: namesFromCSR(csr), CertificatePEM: issuedCert.Certificate, PrivateKeyPEM: privKeyPEM, - IssuerData: issuedCert.Metadata, + IssuerData: metaJSON, issuerKey: issuerUsed.IssuerKey(), } err = cfg.saveCertResource(ctx, issuerUsed, certRes) @@ -792,7 +806,7 @@ func (cfg *Config) renewCert(ctx context.Context, name string, force, interactiv } // check if renew is still needed - might have been renewed while waiting for lock - timeLeft, needsRenew := cfg.managedCertNeedsRenewal(certRes) + timeLeft, leaf, needsRenew := cfg.managedCertNeedsRenewal(certRes, false) if !needsRenew { if force { log.Info("certificate does not need to be renewed, but renewal is being forced", @@ -869,6 +883,18 @@ func (cfg *Config) renewCert(ctx context.Context, name string, force, interactiv } } + // if we're renewing with the same ACME CA as before, have the ACME + // client tell the server we are replacing a certificate (but doing + // this on the wrong CA, or when the CA doesn't recognize the certID, + // can fail the order) + if acmeData, err := certRes.getACMEData(); err == nil && acmeData.CA != "" { + if acmeIss, ok := issuer.(*ACMEIssuer); ok { + if acmeIss.CA == acmeData.CA { + ctx = context.WithValue(ctx, ctxKeyARIReplaces, leaf) + } + } + } + issuedCert, err = issuer.Issue(ctx, useCSR) if err == nil { issuerUsed = issuer @@ -902,11 +928,15 @@ func (cfg *Config) renewCert(ctx context.Context, name string, force, interactiv issuerKey := issuerUsed.IssuerKey() // success - immediately save the renewed certificate resource + metaJSON, err := json.Marshal(issuedCert.Metadata) + if err != nil { + log.Error("unable to encode certificate metadata", zap.Error(err)) + } newCertRes := CertificateResource{ SANs: namesFromCSR(csr), CertificatePEM: issuedCert.Certificate, PrivateKeyPEM: certRes.PrivateKeyPEM, - IssuerData: issuedCert.Metadata, + IssuerData: metaJSON, issuerKey: issuerKey, } err = cfg.saveCertResource(ctx, issuerUsed, newCertRes) @@ -1206,14 +1236,19 @@ func (cfg *Config) lockKey(op, domainName string) string { // managedCertNeedsRenewal returns true if certRes is expiring soon or already expired, // or if the process of decoding the cert and checking its expiration returned an error. -func (cfg *Config) managedCertNeedsRenewal(certRes CertificateResource) (time.Duration, bool) { +// If there wasn't an error, the leaf cert is also returned, so it can be reused if +// necessary, since we are parsing the PEM bundle anyway. +func (cfg *Config) managedCertNeedsRenewal(certRes CertificateResource, emitLogs bool) (time.Duration, *x509.Certificate, bool) { certChain, err := parseCertsFromPEMBundle(certRes.CertificatePEM) - if err != nil { - return 0, true + if err != nil || len(certChain) == 0 { + return 0, nil, true + } + var ari acme.RenewalInfo + if ariPtr, err := certRes.getARI(); err == nil && ariPtr != nil { + ari = *ariPtr } remaining := time.Until(expiresAt(certChain[0])) - needsRenew := currentlyInRenewalWindow(certChain[0].NotBefore, expiresAt(certChain[0]), cfg.RenewalWindowRatio) - return remaining, needsRenew + return remaining, certChain[0], cfg.certNeedsRenewal(certChain[0], ari, emitLogs) } func (cfg *Config) emit(ctx context.Context, eventName string, data map[string]any) error { diff --git a/config_test.go b/config_test.go index 06191590..e1e848e4 100644 --- a/config_test.go +++ b/config_test.go @@ -15,7 +15,9 @@ package certmagic import ( + "bytes" "context" + "encoding/json" "os" "reflect" "testing" @@ -51,9 +53,9 @@ func TestSaveCertResource(t *testing.T) { SANs: []string{domain}, PrivateKeyPEM: []byte(keyContents), CertificatePEM: []byte(certContents), - IssuerData: &acme.Certificate{ + IssuerData: mustJSON(acme.Certificate{ URL: "https://example.com/cert", - }, + }), issuerKey: am.IssuerKey(), } @@ -62,17 +64,22 @@ func TestSaveCertResource(t *testing.T) { t.Fatalf("Expected no error, got: %v", err) } - // the result of our test will be a map, since we have - // no choice but to decode it into an 'any' interface - cert.IssuerData = map[string]any{ - "url": "https://example.com/cert", - } - siteData, err := testConfig.loadCertResource(ctx, am, domain) if err != nil { t.Fatalf("Expected no error reading site, got: %v", err) } + siteData.IssuerData = bytes.ReplaceAll(siteData.IssuerData, []byte("\t"), []byte("")) + siteData.IssuerData = bytes.ReplaceAll(siteData.IssuerData, []byte("\n"), []byte("")) + siteData.IssuerData = bytes.ReplaceAll(siteData.IssuerData, []byte(" "), []byte("")) if !reflect.DeepEqual(cert, siteData) { - t.Errorf("Expected '%+v' to match '%+v'", cert, siteData) + t.Errorf("Expected '%+v' to match '%+v'\n%s\n%s", cert.IssuerData, siteData.IssuerData, string(cert.IssuerData), string(siteData.IssuerData)) + } +} + +func mustJSON(val any) []byte { + result, err := json.Marshal(val) + if err != nil { + panic("marshaling JSON: " + err.Error()) } + return result } diff --git a/go.mod b/go.mod index a10cc8a6..b09c00cc 100644 --- a/go.mod +++ b/go.mod @@ -8,7 +8,7 @@ require ( github.com/caddyserver/zerossl v0.1.2 github.com/klauspost/cpuid/v2 v2.2.7 github.com/libdns/libdns v0.2.2 - github.com/mholt/acmez/v2 v2.0.0 + github.com/mholt/acmez/v2 v2.0.1-0.20240506200913-5a16e768dea9 github.com/miekg/dns v1.1.58 github.com/zeebo/blake3 v0.2.3 go.uber.org/zap v1.27.0 diff --git a/go.sum b/go.sum index 9109c118..e679a722 100644 --- a/go.sum +++ b/go.sum @@ -7,8 +7,8 @@ github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuV github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= github.com/libdns/libdns v0.2.2 h1:O6ws7bAfRPaBsgAYt8MDe2HcNBGC29hkZ9MX2eUSX3s= github.com/libdns/libdns v0.2.2/go.mod h1:4Bj9+5CQiNMVGf87wjX4CY3HQJypUHRuLvlsfsZqLWQ= -github.com/mholt/acmez/v2 v2.0.0 h1:FsGoEdA5mPhAaGfrwIEM13jnn2hn3jU6Vny+RWypt3o= -github.com/mholt/acmez/v2 v2.0.0/go.mod h1:fX4c9r5jYwMyMsC+7tkYRxHibkOTgta5DIFGoe67e1U= +github.com/mholt/acmez/v2 v2.0.1-0.20240506200913-5a16e768dea9 h1:b8yqT+RDihRjjBRq2FWtnduFHqPlpyrKM4ZH0UatgKM= +github.com/mholt/acmez/v2 v2.0.1-0.20240506200913-5a16e768dea9/go.mod h1:fX4c9r5jYwMyMsC+7tkYRxHibkOTgta5DIFGoe67e1U= github.com/miekg/dns v1.1.58 h1:ca2Hdkz+cDg/7eNF6V56jjzuZ4aCAE+DbVkILdQWG/4= github.com/miekg/dns v1.1.58/go.mod h1:Ypv+3b/KadlvW9vJfXOTf300O4UqaHFzFCuHz+rPkBY= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= diff --git a/handshake.go b/handshake.go index 49a6c271..bd23ac61 100644 --- a/handshake.go +++ b/handshake.go @@ -546,11 +546,11 @@ func (cfg *Config) obtainOnDemandCertificate(ctx context.Context, hello *tls.Cli // // This function is safe for use by multiple concurrent goroutines. func (cfg *Config) handshakeMaintenance(ctx context.Context, hello *tls.ClientHelloInfo, cert Certificate) (Certificate, error) { - log := cfg.Logger.Named("on_demand") + logger := cfg.Logger.Named("on_demand") // Check OCSP staple validity if cert.ocsp != nil && !freshOCSP(cert.ocsp) { - log.Debug("OCSP response needs refreshing", + logger.Debug("OCSP response needs refreshing", zap.Strings("identifiers", cert.Names), zap.Int("ocsp_status", cert.ocsp.Status), zap.Time("this_update", cert.ocsp.ThisUpdate), @@ -560,12 +560,12 @@ func (cfg *Config) handshakeMaintenance(ctx context.Context, hello *tls.ClientHe if err != nil { // An error with OCSP stapling is not the end of the world, and in fact, is // quite common considering not all certs have issuer URLs that support it. - log.Warn("stapling OCSP", + logger.Warn("stapling OCSP", zap.String("server_name", hello.ServerName), zap.Strings("sans", cert.Names), zap.Error(err)) } else { - log.Debug("successfully stapled new OCSP response", + logger.Debug("successfully stapled new OCSP response", zap.Strings("identifiers", cert.Names), zap.Int("ocsp_status", cert.ocsp.Status), zap.Time("this_update", cert.ocsp.ThisUpdate), @@ -578,10 +578,20 @@ func (cfg *Config) handshakeMaintenance(ctx context.Context, hello *tls.ClientHe cfg.certCache.mu.Unlock() } + // Check ARI status + if cert.ari.NeedsRefresh() { + // we ignore the second return value here because we go on to check renewal status below regardless + var err error + cert, _, err = cfg.updateARI(ctx, cert, logger) + if err != nil { + logger.Error("updated ARI", zap.Error(err)) + } + } + // We attempt to replace any certificates that were revoked. // Crucially, this happens OUTSIDE a lock on the certCache. if certShouldBeForceRenewed(cert) { - log.Warn("on-demand certificate's OCSP status is REVOKED; will try to forcefully renew", + logger.Warn("on-demand certificate's OCSP status is REVOKED; will try to forcefully renew", zap.Strings("identifiers", cert.Names), zap.Int("ocsp_status", cert.ocsp.Status), zap.Time("revoked_at", cert.ocsp.RevokedAt), @@ -591,14 +601,13 @@ func (cfg *Config) handshakeMaintenance(ctx context.Context, hello *tls.ClientHe } // Check cert expiration - if currentlyInRenewalWindow(cert.Leaf.NotBefore, expiresAt(cert.Leaf), cfg.RenewalWindowRatio) { + if cfg.certNeedsRenewal(cert.Leaf, cert.ari, true) { // Check if the certificate still exists on disk. If not, we need to obtain a new one. // This can happen if the certificate was cleaned up by the storage cleaner, but still // remains in the in-memory cache. if !cfg.storageHasCertResourcesAnyIssuer(ctx, cert.Names[0]) { - log.Debug("certificate not found on disk; obtaining new certificate", + logger.Debug("certificate not found on disk; obtaining new certificate", zap.Strings("identifiers", cert.Names)) - return cfg.obtainOnDemandCertificate(ctx, hello) } // Otherwise, renew the certificate. @@ -620,7 +629,7 @@ func (cfg *Config) handshakeMaintenance(ctx context.Context, hello *tls.ClientHe // // This function is safe for use by multiple concurrent goroutines. func (cfg *Config) renewDynamicCertificate(ctx context.Context, hello *tls.ClientHelloInfo, currentCert Certificate) (Certificate, error) { - log := logWithRemote(cfg.Logger.Named("on_demand"), hello) + logger := logWithRemote(cfg.Logger.Named("on_demand"), hello) name := cfg.getNameFromClientHello(hello) timeLeft := time.Until(expiresAt(currentCert.Leaf)) @@ -637,7 +646,7 @@ func (cfg *Config) renewDynamicCertificate(ctx context.Context, hello *tls.Clien // renewing it, so we might as well serve what we have without blocking, UNLESS // we're forcing renewal, in which case the current certificate is not usable if timeLeft > 0 && !revoked { - log.Debug("certificate expires soon but is already being renewed; serving current certificate", + logger.Debug("certificate expires soon but is already being renewed; serving current certificate", zap.Strings("subjects", currentCert.Names), zap.Duration("remaining", timeLeft)) return currentCert, nil @@ -646,7 +655,7 @@ func (cfg *Config) renewDynamicCertificate(ctx context.Context, hello *tls.Clien // otherwise, we'll have to wait for the renewal to finish so we don't serve // a revoked or expired certificate - log.Debug("certificate has expired, but is already being renewed; waiting for renewal to complete", + logger.Debug("certificate has expired, but is already being renewed; waiting for renewal to complete", zap.Strings("subjects", currentCert.Names), zap.Time("expired", expiresAt(currentCert.Leaf)), zap.Bool("revoked", revoked)) @@ -677,7 +686,7 @@ func (cfg *Config) renewDynamicCertificate(ctx context.Context, hello *tls.Clien obtainCertWaitChansMu.Unlock() } - log = log.With( + logger = logger.With( zap.String("server_name", name), zap.Strings("subjects", currentCert.Names), zap.Time("expiration", expiresAt(currentCert.Leaf)), @@ -698,19 +707,19 @@ func (cfg *Config) renewDynamicCertificate(ctx context.Context, hello *tls.Clien cfg.certCache.mu.Unlock() unblockWaiters() - if log != nil { - log.Error("certificate should not be obtained", zap.Error(err)) + if logger != nil { + logger.Error("certificate should not be obtained", zap.Error(err)) } return Certificate{}, err } - log.Info("attempting certificate renewal") + logger.Info("attempting certificate renewal") // otherwise, renew with issuer, etc. var newCert Certificate if revoked { - newCert, err = cfg.forceRenew(ctx, log, currentCert) + newCert, err = cfg.forceRenew(ctx, logger, currentCert) } else { err = cfg.RenewCertAsync(ctx, name, false) if err == nil { @@ -725,7 +734,7 @@ func (cfg *Config) renewDynamicCertificate(ctx context.Context, hello *tls.Clien unblockWaiters() if err != nil { - log.Error("renewing and reloading certificate", zap.String("server_name", name), zap.Error(err)) + logger.Error("renewing and reloading certificate", zap.String("server_name", name), zap.Error(err)) } return newCert, err diff --git a/maintain.go b/maintain.go index 57732802..848447bd 100644 --- a/maintain.go +++ b/maintain.go @@ -92,7 +92,7 @@ func (certCache *Cache) maintainAssets(panicCount int) { func (certCache *Cache) RenewManagedCertificates(ctx context.Context) error { log := certCache.logger.Named("maintenance") - // configs will hold a map of certificate name to the config + // configs will hold a map of certificate hash to the config // to use when managing that certificate configs := make(map[string]*Config) @@ -102,7 +102,7 @@ func (certCache *Cache) RenewManagedCertificates(ctx context.Context) error { // words, our first iteration through the certificate cache does NOT // perform any operations--only queues them--so that more fine-grained // write locks may be obtained during the actual operations. - var renewQueue, reloadQueue, deleteQueue []Certificate + var renewQueue, reloadQueue, deleteQueue, ariQueue certList certCache.mu.RLock() for certKey, cert := range certCache.cache { @@ -135,22 +135,28 @@ func (certCache *Cache) RenewManagedCertificates(ctx context.Context) error { continue } + // ACME-specific: see if if ACME Renewal Info (ARI) window needs refreshing + if cert.ari.NeedsRefresh() { + configs[cert.hash] = cfg + ariQueue = append(ariQueue, cert) + } + // if time is up or expires soon, we need to try to renew it if cert.NeedsRenewal(cfg) { - configs[cert.Names[0]] = cfg + configs[cert.hash] = cfg // see if the certificate in storage has already been renewed, possibly by another // instance that didn't coordinate with this one; if so, just load it (this // might happen if another instance already renewed it - kinda sloppy but checking disk // first is a simple way to possibly drastically reduce rate limit problems) - storedCertExpiring, err := cfg.managedCertInStorageExpiresSoon(ctx, cert) + storedCertNeedsRenew, err := cfg.managedCertInStorageNeedsRenewal(ctx, cert) if err != nil { // hmm, weird, but not a big deal, maybe it was deleted or something log.Warn("error while checking if stored certificate is also expiring soon", zap.Strings("identifiers", cert.Names), zap.Error(err)) - } else if !storedCertExpiring { - // if the certificate is NOT expiring soon and there was no error, then we + } else if !storedCertNeedsRenew { + // if the certificate does NOT need renewal and there was no error, then we // are good to just reload the certificate from storage instead of repeating // a likely-unnecessary renewal procedure reloadQueue = append(reloadQueue, cert) @@ -161,11 +167,30 @@ func (certCache *Cache) RenewManagedCertificates(ctx context.Context) error { // NOTE: It is super-important to note that the TLS-ALPN challenge requires // a write lock on the cache in order to complete its challenge, so it is extra // vital that this renew operation does not happen inside our read lock! - renewQueue = append(renewQueue, cert) + renewQueue.insert(cert) } } certCache.mu.RUnlock() + // Update ARI, and then for any certs where the ARI window changed, + // be sure to queue them for renewal if necessary + for _, cert := range ariQueue { + cfg := configs[cert.hash] + cert, changed, err := cfg.updateARI(ctx, cert, log) + if err != nil { + log.Error("updating ARI", zap.Error(err)) + } + if changed && cert.NeedsRenewal(cfg) { + // it's theoretically possible that another instance already got the memo + // on the changed ARI and even renewed the cert already, and thus doing it + // here is wasteful, but I have never heard of this happening in reality, + // so to save some cycles for now I think we'll just queue it for renewal + // (notice how we use 'insert' to avoid duplicates, in case it was already + // scheduled for renewal anyway) + renewQueue.insert(cert) + } + } + // Reload certificates that merely need to be updated in memory for _, oldCert := range reloadQueue { timeLeft := expiresAt(oldCert.Leaf).Sub(time.Now().UTC()) @@ -173,7 +198,7 @@ func (certCache *Cache) RenewManagedCertificates(ctx context.Context) error { zap.Strings("identifiers", oldCert.Names), zap.Duration("remaining", timeLeft)) - cfg := configs[oldCert.Names[0]] + cfg := configs[oldCert.hash] // crucially, this happens OUTSIDE a lock on the certCache _, err := cfg.reloadManagedCertificate(ctx, oldCert) @@ -187,7 +212,7 @@ func (certCache *Cache) RenewManagedCertificates(ctx context.Context) error { // Renewal queue for _, oldCert := range renewQueue { - cfg := configs[oldCert.Names[0]] + cfg := configs[oldCert.hash] err := certCache.queueRenewalTask(ctx, oldCert, cfg) if err != nil { log.Error("queueing renewal task", @@ -390,6 +415,171 @@ func (certCache *Cache) updateOCSPStaples(ctx context.Context) { } } +// storageHasNewerARI returns true if the configured storage has ARI that is newer +// than that of a certificate that is already loaded, along with the value from +// storage. +func (cfg *Config) storageHasNewerARI(ctx context.Context, cert Certificate) (bool, acme.RenewalInfo, error) { + storedCertData, err := cfg.loadStoredACMECertificateMetadata(ctx, cert) + if err != nil || storedCertData.RenewalInfo == nil { + return false, acme.RenewalInfo{}, err + } + // prefer stored info if it has a window and the loaded one doesn't, + // or if the one in storage has a later RetryAfter (though I suppose + // it's not guaranteed, typically those will move forward in time) + if (!cert.ari.HasWindow() && storedCertData.RenewalInfo.HasWindow()) || + storedCertData.RenewalInfo.RetryAfter.After(*cert.ari.RetryAfter) { + return true, *storedCertData.RenewalInfo, nil + } + return false, acme.RenewalInfo{}, nil +} + +// loadStoredACMECertificateMetadata loads the stored ACME certificate data +// from the cert's sidecar JSON file. +func (cfg *Config) loadStoredACMECertificateMetadata(ctx context.Context, cert Certificate) (acme.Certificate, error) { + metaBytes, err := cfg.Storage.Load(ctx, StorageKeys.SiteMeta(cert.issuerKey, cert.Names[0])) + if err != nil { + return acme.Certificate{}, fmt.Errorf("loading cert metadata: %w", err) + } + + var certRes CertificateResource + if err = json.Unmarshal(metaBytes, &certRes); err != nil { + return acme.Certificate{}, fmt.Errorf("unmarshaling cert metadata: %w", err) + } + + var acmeCert acme.Certificate + if err = json.Unmarshal(certRes.IssuerData, &acmeCert); err != nil { + return acme.Certificate{}, fmt.Errorf("unmarshaling potential ACME issuer metadata: %v", err) + } + + return acmeCert, nil +} + +// updateARI updates the cert's ACME renewal info, first by checking storage for a newer +// one, or getting it from the CA if needed. The updated info is stored in storage and +// updated in the cache. The certificate with the updated ARI is returned. If true is +// returned, the ARI window or selected time has changed, and the caller should check if +// the cert needs to be renewed now, even if there is an error. +func (cfg *Config) updateARI(ctx context.Context, cert Certificate, logger *zap.Logger) (updatedCert Certificate, changed bool, err error) { + logger = logger.With( + zap.Strings("identifiers", cert.Names), + zap.String("cert_hash", cert.hash), + zap.String("ari_unique_id", cert.ari.UniqueIdentifier), + zap.Time("cert_expiry", cert.Leaf.NotAfter)) + + updatedCert = cert + oldARI := cert.ari + + // see if the stored value has been refreshed already by another instance + gotNewARI, newARI, err := cfg.storageHasNewerARI(ctx, cert) + + // when we're all done, log if something about the schedule is different + // ("WARN" level because ARI window changing may be a sign of external trouble + // and we want to draw their attention to a potential explanation URL) + defer func() { + changed = !newARI.SameWindow(oldARI) + + if changed { + logger.Warn("ARI window or selected renewal time changed", + zap.Time("prev_start", oldARI.SuggestedWindow.Start), + zap.Time("next_start", newARI.SuggestedWindow.Start), + zap.Time("prev_end", oldARI.SuggestedWindow.End), + zap.Time("next_end", newARI.SuggestedWindow.End), + zap.Time("prev_selected_time", oldARI.SelectedTime), + zap.Time("next_selected_time", newARI.SelectedTime), + zap.String("explanation_url", newARI.ExplanationURL)) + } + }() + + if err == nil && gotNewARI { + // great, storage has a newer one we can use + cfg.certCache.mu.Lock() + updatedCert = cfg.certCache.cache[cert.hash] + updatedCert.ari = newARI + cfg.certCache.cache[cert.hash] = updatedCert + cfg.certCache.mu.Unlock() + logger.Info("reloaded ARI with newer one in storage", + zap.Timep("next_refresh", newARI.RetryAfter), + zap.Time("renewal_time", newARI.SelectedTime)) + return + } + + if err != nil { + logger.Error("error while checking storage for updated ARI; updating ARI now", zap.Error(err)) + } + + // of the issuers configured, hopefully one of them is the ACME CA we got the cert from + for _, iss := range cfg.Issuers { + if acmeIss, ok := iss.(*ACMEIssuer); ok { + newARI, err = acmeIss.getRenewalInfo(ctx, cert) // be sure to use existing newARI variable so we can compare against old value in the defer + if err != nil { + // could be anything, but a common error might simply be the "wrong" ACME CA + // (meaning, different from the one that issued the cert, thus the only one + // that would have any ARI for it) if multiple ACME CAs are configured + logger.Error("failed updating renewal info from ACME CA", + zap.String("issuer", iss.IssuerKey()), + zap.Error(err)) + continue + } + + // when we get the latest ARI, the acme package will select a time within the window + // for us; of course, since it's random, it's likely different from the previously- + // selected time; but if the window doesn't change, there's no need to change the + // selected time (the acme package doesn't know the previous window to know better) + // ... so if the window hasn't changed we'll just put back the selected time + if newARI.SameWindow(oldARI) && !oldARI.SelectedTime.IsZero() { + newARI.SelectedTime = oldARI.SelectedTime + } + + // then store the updated ARI (even if the window didn't change, the Retry-After + // likely did) in cache and storage + + // be sure we get the cert from the cache while inside a lock to avoid logical races + cfg.certCache.mu.Lock() + updatedCert = cfg.certCache.cache[cert.hash] + updatedCert.ari = newARI + cfg.certCache.cache[cert.hash] = updatedCert + cfg.certCache.mu.Unlock() + + // update the ARI value in storage + var certData acme.Certificate + certData, err = cfg.loadStoredACMECertificateMetadata(ctx, cert) + if err != nil { + err = fmt.Errorf("got new ARI from %s, but failed loading stored certificate metadata: %v", iss.IssuerKey(), err) + return + } + certData.RenewalInfo = &newARI + var certDataBytes, certResBytes []byte + certDataBytes, err = json.Marshal(certData) + if err != nil { + err = fmt.Errorf("got new ARI from %s, but failed marshaling certificate ACME metadata: %v", iss.IssuerKey(), err) + return + } + certResBytes, err = json.MarshalIndent(CertificateResource{ + SANs: cert.Names, + IssuerData: certDataBytes, + }, "", "\t") + if err != nil { + err = fmt.Errorf("got new ARI from %s, but could not re-encode certificate metadata: %v", iss.IssuerKey(), err) + return + } + if err = cfg.Storage.Store(ctx, StorageKeys.SiteMeta(cert.issuerKey, cert.Names[0]), certResBytes); err != nil { + err = fmt.Errorf("got new ARI from %s, but could not store it with certificate metadata: %v", iss.IssuerKey(), err) + return + } + + logger.Info("updated ACME renewal information", + zap.Time("selected_time", newARI.SelectedTime), + zap.Timep("next_update", newARI.RetryAfter), + zap.String("explanation_url", newARI.ExplanationURL)) + + return + } + } + + err = fmt.Errorf("could not fully update ACME renewal info: either no ACME issuer configured for certificate, or all failed (make sure the ACME CA that issued the certificate is configured)") + return +} + // CleanStorageOptions specifies how to clean up a storage unit. type CleanStorageOptions struct { // Optional custom logger. @@ -725,6 +915,19 @@ func certShouldBeForceRenewed(cert Certificate) bool { cert.ocsp.Status == ocsp.Revoked } +type certList []Certificate + +// insert appends cert to the list if it is not already in the list. +// Efficiency: O(n) +func (certs *certList) insert(cert Certificate) { + for _, c := range *certs { + if c.hash == cert.hash { + return + } + } + *certs = append(*certs, cert) +} + const ( // DefaultRenewCheckInterval is how often to check certificates for expiration. // Scans are very lightweight, so this can be semi-frequent. This default should diff --git a/zerosslissuer.go b/zerosslissuer.go index 5ab5cd0a..b9ffa7d7 100644 --- a/zerosslissuer.go +++ b/zerosslissuer.go @@ -258,7 +258,11 @@ func (iss *ZeroSSLIssuer) Revoke(ctx context.Context, cert CertificateResource, default: return fmt.Errorf("unsupported reason: %d", reason) } - return iss.getClient().RevokeCertificate(ctx, cert.IssuerData.(zerossl.CertificateObject).ID, r) + var certObj zerossl.CertificateObject + if err := json.Unmarshal(cert.IssuerData, &certObj); err != nil { + return err + } + return iss.getClient().RevokeCertificate(ctx, certObj.ID, r) } func (iss *ZeroSSLIssuer) getDistributedValidationInfo(ctx context.Context, identifier string) (acme.Challenge, bool, error) {