-
-
Notifications
You must be signed in to change notification settings - Fork 293
/
maintain.go
946 lines (839 loc) · 32.6 KB
/
maintain.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
// Copyright 2015 Matthew Holt
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package certmagic
import (
"context"
"crypto/x509"
"encoding/json"
"encoding/pem"
"errors"
"fmt"
"io/fs"
"path"
"runtime"
"strings"
"time"
"github.com/mholt/acmez/v2/acme"
"go.uber.org/zap"
"golang.org/x/crypto/ocsp"
)
// maintainAssets is a permanently-blocking function
// that loops indefinitely and, on a regular schedule, checks
// certificates for expiration and initiates a renewal of certs
// that are expiring soon. It also updates OCSP stapling. It
// should only be called once per cache. Panics are recovered,
// and if panicCount < 10, the function is called recursively,
// incrementing panicCount each time. Initial invocation should
// start panicCount at 0.
func (certCache *Cache) maintainAssets(panicCount int) {
log := certCache.logger.Named("maintenance")
log = log.With(zap.String("cache", fmt.Sprintf("%p", certCache)))
defer func() {
if err := recover(); err != nil {
buf := make([]byte, stackTraceBufferSize)
buf = buf[:runtime.Stack(buf, false)]
log.Error("panic", zap.Any("error", err), zap.ByteString("stack", buf))
if panicCount < 10 {
certCache.maintainAssets(panicCount + 1)
}
}
}()
certCache.optionsMu.RLock()
renewalTicker := time.NewTicker(certCache.options.RenewCheckInterval)
ocspTicker := time.NewTicker(certCache.options.OCSPCheckInterval)
certCache.optionsMu.RUnlock()
log.Info("started background certificate maintenance")
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
for {
select {
case <-renewalTicker.C:
err := certCache.RenewManagedCertificates(ctx)
if err != nil {
log.Error("renewing managed certificates", zap.Error(err))
}
case <-ocspTicker.C:
certCache.updateOCSPStaples(ctx)
case <-certCache.stopChan:
renewalTicker.Stop()
ocspTicker.Stop()
log.Info("stopped background certificate maintenance")
close(certCache.doneChan)
return
}
}
}
// RenewManagedCertificates renews managed certificates,
// including ones loaded on-demand. Note that this is done
// automatically on a regular basis; normally you will not
// need to call this. This method assumes non-interactive
// mode (i.e. operating in the background).
func (certCache *Cache) RenewManagedCertificates(ctx context.Context) error {
log := certCache.logger.Named("maintenance")
// configs will hold a map of certificate hash to the config
// to use when managing that certificate
configs := make(map[string]*Config)
// we use the queues for a very important reason: to do any and all
// operations that could require an exclusive write lock outside
// of the read lock! otherwise we get a deadlock, yikes. in other
// words, our first iteration through the certificate cache does NOT
// perform any operations--only queues them--so that more fine-grained
// write locks may be obtained during the actual operations.
var renewQueue, reloadQueue, deleteQueue, ariQueue certList
certCache.mu.RLock()
for certKey, cert := range certCache.cache {
if !cert.managed {
continue
}
// the list of names on this cert should never be empty... programmer error?
if cert.Names == nil || len(cert.Names) == 0 {
log.Warn("certificate has no names; removing from cache", zap.String("cert_key", certKey))
deleteQueue = append(deleteQueue, cert)
continue
}
// get the config associated with this certificate
cfg, err := certCache.getConfig(cert)
if err != nil {
log.Error("unable to get configuration to manage certificate; unable to renew",
zap.Strings("identifiers", cert.Names),
zap.Error(err))
continue
}
if cfg == nil {
// this is bad if this happens, probably a programmer error (oops)
log.Error("no configuration associated with certificate; unable to manage",
zap.Strings("identifiers", cert.Names))
continue
}
if cfg.OnDemand != nil {
continue
}
// ACME-specific: see if if ACME Renewal Info (ARI) window needs refreshing
if cert.ari.NeedsRefresh() {
configs[cert.hash] = cfg
ariQueue = append(ariQueue, cert)
}
// if time is up or expires soon, we need to try to renew it
if cert.NeedsRenewal(cfg) {
configs[cert.hash] = cfg
// see if the certificate in storage has already been renewed, possibly by another
// instance that didn't coordinate with this one; if so, just load it (this
// might happen if another instance already renewed it - kinda sloppy but checking disk
// first is a simple way to possibly drastically reduce rate limit problems)
storedCertNeedsRenew, err := cfg.managedCertInStorageNeedsRenewal(ctx, cert)
if err != nil {
// hmm, weird, but not a big deal, maybe it was deleted or something
log.Warn("error while checking if stored certificate is also expiring soon",
zap.Strings("identifiers", cert.Names),
zap.Error(err))
} else if !storedCertNeedsRenew {
// if the certificate does NOT need renewal and there was no error, then we
// are good to just reload the certificate from storage instead of repeating
// a likely-unnecessary renewal procedure
reloadQueue = append(reloadQueue, cert)
continue
}
// the certificate in storage has not been renewed yet, so we will do it
// NOTE: It is super-important to note that the TLS-ALPN challenge requires
// a write lock on the cache in order to complete its challenge, so it is extra
// vital that this renew operation does not happen inside our read lock!
renewQueue.insert(cert)
}
}
certCache.mu.RUnlock()
// Update ARI, and then for any certs where the ARI window changed,
// be sure to queue them for renewal if necessary
for _, cert := range ariQueue {
cfg := configs[cert.hash]
cert, changed, err := cfg.updateARI(ctx, cert, log)
if err != nil {
log.Error("updating ARI", zap.Error(err))
}
if changed && cert.NeedsRenewal(cfg) {
// it's theoretically possible that another instance already got the memo
// on the changed ARI and even renewed the cert already, and thus doing it
// here is wasteful, but I have never heard of this happening in reality,
// so to save some cycles for now I think we'll just queue it for renewal
// (notice how we use 'insert' to avoid duplicates, in case it was already
// scheduled for renewal anyway)
renewQueue.insert(cert)
}
}
// Reload certificates that merely need to be updated in memory
for _, oldCert := range reloadQueue {
timeLeft := expiresAt(oldCert.Leaf).Sub(time.Now().UTC())
log.Info("certificate expires soon, but is already renewed in storage; reloading stored certificate",
zap.Strings("identifiers", oldCert.Names),
zap.Duration("remaining", timeLeft))
cfg := configs[oldCert.hash]
// crucially, this happens OUTSIDE a lock on the certCache
_, err := cfg.reloadManagedCertificate(ctx, oldCert)
if err != nil {
log.Error("loading renewed certificate",
zap.Strings("identifiers", oldCert.Names),
zap.Error(err))
continue
}
}
// Renewal queue
for _, oldCert := range renewQueue {
cfg := configs[oldCert.hash]
err := certCache.queueRenewalTask(ctx, oldCert, cfg)
if err != nil {
log.Error("queueing renewal task",
zap.Strings("identifiers", oldCert.Names),
zap.Error(err))
continue
}
}
// Deletion queue
certCache.mu.Lock()
for _, cert := range deleteQueue {
certCache.removeCertificate(cert)
}
certCache.mu.Unlock()
return nil
}
func (certCache *Cache) queueRenewalTask(ctx context.Context, oldCert Certificate, cfg *Config) error {
log := certCache.logger.Named("maintenance")
timeLeft := expiresAt(oldCert.Leaf).Sub(time.Now().UTC())
log.Info("certificate expires soon; queuing for renewal",
zap.Strings("identifiers", oldCert.Names),
zap.Duration("remaining", timeLeft))
// Get the name which we should use to renew this certificate;
// we only support managing certificates with one name per cert,
// so this should be easy.
renewName := oldCert.Names[0]
// queue up this renewal job (is a no-op if already active or queued)
jm.Submit(cfg.Logger, "renew_"+renewName, func() error {
timeLeft := expiresAt(oldCert.Leaf).Sub(time.Now().UTC())
log.Info("attempting certificate renewal",
zap.Strings("identifiers", oldCert.Names),
zap.Duration("remaining", timeLeft))
// perform renewal - crucially, this happens OUTSIDE a lock on certCache
err := cfg.RenewCertAsync(ctx, renewName, false)
if err != nil {
if cfg.OnDemand != nil {
// loaded dynamically, remove dynamically
certCache.mu.Lock()
certCache.removeCertificate(oldCert)
certCache.mu.Unlock()
}
return fmt.Errorf("%v %v", oldCert.Names, err)
}
// successful renewal, so update in-memory cache by loading
// renewed certificate so it will be used with handshakes
_, err = cfg.reloadManagedCertificate(ctx, oldCert)
if err != nil {
return ErrNoRetry{fmt.Errorf("%v %v", oldCert.Names, err)}
}
return nil
})
return nil
}
// updateOCSPStaples updates the OCSP stapling in all
// eligible, cached certificates.
//
// OCSP maintenance strives to abide the relevant points on
// Ryan Sleevi's recommendations for good OCSP support:
// https://gist.github.com/sleevi/5efe9ef98961ecfb4da8
func (certCache *Cache) updateOCSPStaples(ctx context.Context) {
logger := certCache.logger.Named("maintenance")
// temporary structures to store updates or tasks
// so that we can keep our locks short-lived
type ocspUpdate struct {
rawBytes []byte
parsed *ocsp.Response
}
type updateQueueEntry struct {
cert Certificate
certHash string
lastNextUpdate time.Time
cfg *Config
}
type renewQueueEntry struct {
oldCert Certificate
cfg *Config
}
updated := make(map[string]ocspUpdate)
var updateQueue []updateQueueEntry // certs that need a refreshed staple
var renewQueue []renewQueueEntry // certs that need to be renewed (due to revocation)
// obtain brief read lock during our scan to see which staples need updating
certCache.mu.RLock()
for certHash, cert := range certCache.cache {
// no point in updating OCSP for expired or "synthetic" certificates
if cert.Leaf == nil || cert.Expired() {
continue
}
cfg, err := certCache.getConfig(cert)
if err != nil {
logger.Error("unable to get automation config for certificate; maintenance for this certificate will likely fail",
zap.Strings("identifiers", cert.Names),
zap.Error(err))
continue
}
// always try to replace revoked certificates, even if OCSP response is still fresh
if certShouldBeForceRenewed(cert) {
renewQueue = append(renewQueue, renewQueueEntry{
oldCert: cert,
cfg: cfg,
})
continue
}
// if the status is not fresh, get a new one
var lastNextUpdate time.Time
if cert.ocsp != nil {
lastNextUpdate = cert.ocsp.NextUpdate
if cert.ocsp.Status != ocsp.Unknown && freshOCSP(cert.ocsp) {
// no need to update our staple if still fresh and not Unknown
continue
}
}
updateQueue = append(updateQueue, updateQueueEntry{cert, certHash, lastNextUpdate, cfg})
}
certCache.mu.RUnlock()
// perform updates outside of any lock on certCache
for _, qe := range updateQueue {
cert := qe.cert
certHash := qe.certHash
lastNextUpdate := qe.lastNextUpdate
if qe.cfg == nil {
// this is bad if this happens, probably a programmer error (oops)
logger.Error("no configuration associated with certificate; unable to manage OCSP staples",
zap.Strings("identifiers", cert.Names))
continue
}
err := stapleOCSP(ctx, qe.cfg.OCSP, qe.cfg.Storage, &cert, nil)
if err != nil {
if cert.ocsp != nil {
// if there was no staple before, that's fine; otherwise we should log the error
logger.Error("stapling OCSP",
zap.Strings("identifiers", cert.Names),
zap.Error(err))
}
continue
}
// By this point, we've obtained the latest OCSP response.
// If there was no staple before, or if the response is updated, make
// sure we apply the update to all names on the certificate if
// the status is still Good.
if cert.ocsp != nil && cert.ocsp.Status == ocsp.Good && (lastNextUpdate.IsZero() || lastNextUpdate != cert.ocsp.NextUpdate) {
logger.Info("advancing OCSP staple",
zap.Strings("identifiers", cert.Names),
zap.Time("from", lastNextUpdate),
zap.Time("to", cert.ocsp.NextUpdate))
updated[certHash] = ocspUpdate{rawBytes: cert.Certificate.OCSPStaple, parsed: cert.ocsp}
}
// If the updated staple shows that the certificate was revoked, we should immediately renew it
if certShouldBeForceRenewed(cert) {
qe.cfg.emit(ctx, "cert_ocsp_revoked", map[string]any{
"subjects": cert.Names,
"certificate": cert,
"reason": cert.ocsp.RevocationReason,
"revoked_at": cert.ocsp.RevokedAt,
})
renewQueue = append(renewQueue, renewQueueEntry{
oldCert: cert,
cfg: qe.cfg,
})
}
}
// These write locks should be brief since we have all the info we need now.
for certKey, update := range updated {
certCache.mu.Lock()
if cert, ok := certCache.cache[certKey]; ok {
cert.ocsp = update.parsed
cert.Certificate.OCSPStaple = update.rawBytes
certCache.cache[certKey] = cert
}
certCache.mu.Unlock()
}
// We attempt to replace any certificates that were revoked.
// Crucially, this happens OUTSIDE a lock on the certCache.
for _, renew := range renewQueue {
_, err := renew.cfg.forceRenew(ctx, logger, renew.oldCert)
if err != nil {
logger.Info("forcefully renewing certificate due to REVOKED status",
zap.Strings("identifiers", renew.oldCert.Names),
zap.Error(err))
}
}
}
// storageHasNewerARI returns true if the configured storage has ARI that is newer
// than that of a certificate that is already loaded, along with the value from
// storage.
func (cfg *Config) storageHasNewerARI(ctx context.Context, cert Certificate) (bool, acme.RenewalInfo, error) {
storedCertData, err := cfg.loadStoredACMECertificateMetadata(ctx, cert)
if err != nil || storedCertData.RenewalInfo == nil {
return false, acme.RenewalInfo{}, err
}
// prefer stored info if it has a window and the loaded one doesn't,
// or if the one in storage has a later RetryAfter (though I suppose
// it's not guaranteed, typically those will move forward in time)
if (!cert.ari.HasWindow() && storedCertData.RenewalInfo.HasWindow()) ||
storedCertData.RenewalInfo.RetryAfter.After(*cert.ari.RetryAfter) {
return true, *storedCertData.RenewalInfo, nil
}
return false, acme.RenewalInfo{}, nil
}
// loadStoredACMECertificateMetadata loads the stored ACME certificate data
// from the cert's sidecar JSON file.
func (cfg *Config) loadStoredACMECertificateMetadata(ctx context.Context, cert Certificate) (acme.Certificate, error) {
metaBytes, err := cfg.Storage.Load(ctx, StorageKeys.SiteMeta(cert.issuerKey, cert.Names[0]))
if err != nil {
return acme.Certificate{}, fmt.Errorf("loading cert metadata: %w", err)
}
var certRes CertificateResource
if err = json.Unmarshal(metaBytes, &certRes); err != nil {
return acme.Certificate{}, fmt.Errorf("unmarshaling cert metadata: %w", err)
}
var acmeCert acme.Certificate
if err = json.Unmarshal(certRes.IssuerData, &acmeCert); err != nil {
return acme.Certificate{}, fmt.Errorf("unmarshaling potential ACME issuer metadata: %v", err)
}
return acmeCert, nil
}
// updateARI updates the cert's ACME renewal info, first by checking storage for a newer
// one, or getting it from the CA if needed. The updated info is stored in storage and
// updated in the cache. The certificate with the updated ARI is returned. If true is
// returned, the ARI window or selected time has changed, and the caller should check if
// the cert needs to be renewed now, even if there is an error.
func (cfg *Config) updateARI(ctx context.Context, cert Certificate, logger *zap.Logger) (updatedCert Certificate, changed bool, err error) {
logger = logger.With(
zap.Strings("identifiers", cert.Names),
zap.String("cert_hash", cert.hash),
zap.String("ari_unique_id", cert.ari.UniqueIdentifier),
zap.Time("cert_expiry", cert.Leaf.NotAfter))
updatedCert = cert
oldARI := cert.ari
// see if the stored value has been refreshed already by another instance
gotNewARI, newARI, err := cfg.storageHasNewerARI(ctx, cert)
// when we're all done, log if something about the schedule is different
// ("WARN" level because ARI window changing may be a sign of external trouble
// and we want to draw their attention to a potential explanation URL)
defer func() {
changed = !newARI.SameWindow(oldARI)
if changed {
logger.Warn("ARI window or selected renewal time changed",
zap.Time("prev_start", oldARI.SuggestedWindow.Start),
zap.Time("next_start", newARI.SuggestedWindow.Start),
zap.Time("prev_end", oldARI.SuggestedWindow.End),
zap.Time("next_end", newARI.SuggestedWindow.End),
zap.Time("prev_selected_time", oldARI.SelectedTime),
zap.Time("next_selected_time", newARI.SelectedTime),
zap.String("explanation_url", newARI.ExplanationURL))
}
}()
if err == nil && gotNewARI {
// great, storage has a newer one we can use
cfg.certCache.mu.Lock()
updatedCert = cfg.certCache.cache[cert.hash]
updatedCert.ari = newARI
cfg.certCache.cache[cert.hash] = updatedCert
cfg.certCache.mu.Unlock()
logger.Info("reloaded ARI with newer one in storage",
zap.Timep("next_refresh", newARI.RetryAfter),
zap.Time("renewal_time", newARI.SelectedTime))
return
}
if err != nil {
logger.Error("error while checking storage for updated ARI; updating ARI now", zap.Error(err))
}
// of the issuers configured, hopefully one of them is the ACME CA we got the cert from
for _, iss := range cfg.Issuers {
if acmeIss, ok := iss.(*ACMEIssuer); ok {
newARI, err = acmeIss.getRenewalInfo(ctx, cert) // be sure to use existing newARI variable so we can compare against old value in the defer
if err != nil {
// could be anything, but a common error might simply be the "wrong" ACME CA
// (meaning, different from the one that issued the cert, thus the only one
// that would have any ARI for it) if multiple ACME CAs are configured
logger.Error("failed updating renewal info from ACME CA",
zap.String("issuer", iss.IssuerKey()),
zap.Error(err))
continue
}
// when we get the latest ARI, the acme package will select a time within the window
// for us; of course, since it's random, it's likely different from the previously-
// selected time; but if the window doesn't change, there's no need to change the
// selected time (the acme package doesn't know the previous window to know better)
// ... so if the window hasn't changed we'll just put back the selected time
if newARI.SameWindow(oldARI) && !oldARI.SelectedTime.IsZero() {
newARI.SelectedTime = oldARI.SelectedTime
}
// then store the updated ARI (even if the window didn't change, the Retry-After
// likely did) in cache and storage
// be sure we get the cert from the cache while inside a lock to avoid logical races
cfg.certCache.mu.Lock()
updatedCert = cfg.certCache.cache[cert.hash]
updatedCert.ari = newARI
cfg.certCache.cache[cert.hash] = updatedCert
cfg.certCache.mu.Unlock()
// update the ARI value in storage
var certData acme.Certificate
certData, err = cfg.loadStoredACMECertificateMetadata(ctx, cert)
if err != nil {
err = fmt.Errorf("got new ARI from %s, but failed loading stored certificate metadata: %v", iss.IssuerKey(), err)
return
}
certData.RenewalInfo = &newARI
var certDataBytes, certResBytes []byte
certDataBytes, err = json.Marshal(certData)
if err != nil {
err = fmt.Errorf("got new ARI from %s, but failed marshaling certificate ACME metadata: %v", iss.IssuerKey(), err)
return
}
certResBytes, err = json.MarshalIndent(CertificateResource{
SANs: cert.Names,
IssuerData: certDataBytes,
}, "", "\t")
if err != nil {
err = fmt.Errorf("got new ARI from %s, but could not re-encode certificate metadata: %v", iss.IssuerKey(), err)
return
}
if err = cfg.Storage.Store(ctx, StorageKeys.SiteMeta(cert.issuerKey, cert.Names[0]), certResBytes); err != nil {
err = fmt.Errorf("got new ARI from %s, but could not store it with certificate metadata: %v", iss.IssuerKey(), err)
return
}
logger.Info("updated ACME renewal information",
zap.Time("selected_time", newARI.SelectedTime),
zap.Timep("next_update", newARI.RetryAfter),
zap.String("explanation_url", newARI.ExplanationURL))
return
}
}
err = fmt.Errorf("could not fully update ACME renewal info: either no ACME issuer configured for certificate, or all failed (make sure the ACME CA that issued the certificate is configured)")
return
}
// CleanStorageOptions specifies how to clean up a storage unit.
type CleanStorageOptions struct {
// Optional custom logger.
Logger *zap.Logger
// Optional ID of the instance initiating the cleaning.
InstanceID string
// If set, cleaning will be skipped if it was performed
// more recently than this interval.
Interval time.Duration
// Whether to clean cached OCSP staples.
OCSPStaples bool
// Whether to cleanup expired certificates, and if so,
// how long to let them stay after they've expired.
ExpiredCerts bool
ExpiredCertGracePeriod time.Duration
}
// CleanStorage removes assets which are no longer useful,
// according to opts.
func CleanStorage(ctx context.Context, storage Storage, opts CleanStorageOptions) error {
const (
lockName = "storage_clean"
storageKey = "last_clean.json"
)
if opts.Logger == nil {
opts.Logger = defaultLogger.Named("clean_storage")
}
opts.Logger = opts.Logger.With(zap.Any("storage", storage))
// storage cleaning should be globally exclusive
if err := storage.Lock(ctx, lockName); err != nil {
return fmt.Errorf("unable to acquire %s lock: %v", lockName, err)
}
defer func() {
if err := storage.Unlock(ctx, lockName); err != nil {
opts.Logger.Error("unable to release lock", zap.Error(err))
return
}
}()
// cleaning should not happen more often than the interval
if opts.Interval > 0 {
lastCleanBytes, err := storage.Load(ctx, storageKey)
if !errors.Is(err, fs.ErrNotExist) {
if err != nil {
return fmt.Errorf("loading last clean timestamp: %v", err)
}
var lastClean lastCleanPayload
err = json.Unmarshal(lastCleanBytes, &lastClean)
if err != nil {
return fmt.Errorf("decoding last clean data: %v", err)
}
lastTLSClean := lastClean["tls"]
if time.Since(lastTLSClean.Timestamp) < opts.Interval {
nextTime := time.Now().Add(opts.Interval)
opts.Logger.Info("storage cleaning happened too recently; skipping for now",
zap.String("instance", lastTLSClean.InstanceID),
zap.Time("try_again", nextTime),
zap.Duration("try_again_in", time.Until(nextTime)),
)
return nil
}
}
}
opts.Logger.Info("cleaning storage unit")
if opts.OCSPStaples {
err := deleteOldOCSPStaples(ctx, storage, opts.Logger)
if err != nil {
opts.Logger.Error("deleting old OCSP staples", zap.Error(err))
}
}
if opts.ExpiredCerts {
err := deleteExpiredCerts(ctx, storage, opts.Logger, opts.ExpiredCertGracePeriod)
if err != nil {
opts.Logger.Error("deleting expired certificates staples", zap.Error(err))
}
}
// TODO: delete stale locks?
// update the last-clean time
lastCleanBytes, err := json.Marshal(lastCleanPayload{
"tls": lastCleaned{
Timestamp: time.Now(),
InstanceID: opts.InstanceID,
},
})
if err != nil {
return fmt.Errorf("encoding last cleaned info: %v", err)
}
if err := storage.Store(ctx, storageKey, lastCleanBytes); err != nil {
return fmt.Errorf("storing last clean info: %v", err)
}
return nil
}
type lastCleanPayload map[string]lastCleaned
type lastCleaned struct {
Timestamp time.Time `json:"timestamp"`
InstanceID string `json:"instance_id,omitempty"`
}
func deleteOldOCSPStaples(ctx context.Context, storage Storage, logger *zap.Logger) error {
ocspKeys, err := storage.List(ctx, prefixOCSP, false)
if err != nil {
// maybe just hasn't been created yet; no big deal
return nil
}
for _, key := range ocspKeys {
// if context was cancelled, quit early; otherwise proceed
select {
case <-ctx.Done():
return ctx.Err()
default:
}
ocspBytes, err := storage.Load(ctx, key)
if err != nil {
logger.Error("while deleting old OCSP staples, unable to load staple file", zap.Error(err))
continue
}
resp, err := ocsp.ParseResponse(ocspBytes, nil)
if err != nil {
// contents are invalid; delete it
err = storage.Delete(ctx, key)
if err != nil {
logger.Error("purging corrupt staple file", zap.String("storage_key", key), zap.Error(err))
}
continue
}
if time.Now().After(resp.NextUpdate) {
// response has expired; delete it
err = storage.Delete(ctx, key)
if err != nil {
logger.Error("purging expired staple file", zap.String("storage_key", key), zap.Error(err))
}
}
}
return nil
}
func deleteExpiredCerts(ctx context.Context, storage Storage, logger *zap.Logger, gracePeriod time.Duration) error {
issuerKeys, err := storage.List(ctx, prefixCerts, false)
if err != nil {
// maybe just hasn't been created yet; no big deal
return nil
}
for _, issuerKey := range issuerKeys {
siteKeys, err := storage.List(ctx, issuerKey, false)
if err != nil {
logger.Error("listing contents", zap.String("issuer_key", issuerKey), zap.Error(err))
continue
}
for _, siteKey := range siteKeys {
// if context was cancelled, quit early; otherwise proceed
select {
case <-ctx.Done():
return ctx.Err()
default:
}
siteAssets, err := storage.List(ctx, siteKey, false)
if err != nil {
logger.Error("listing site contents", zap.String("site_key", siteKey), zap.Error(err))
continue
}
for _, assetKey := range siteAssets {
if path.Ext(assetKey) != ".crt" {
continue
}
certFile, err := storage.Load(ctx, assetKey)
if err != nil {
return fmt.Errorf("loading certificate file %s: %v", assetKey, err)
}
block, _ := pem.Decode(certFile)
if block == nil || block.Type != "CERTIFICATE" {
return fmt.Errorf("certificate file %s does not contain PEM-encoded certificate", assetKey)
}
cert, err := x509.ParseCertificate(block.Bytes)
if err != nil {
return fmt.Errorf("certificate file %s is malformed; error parsing PEM: %v", assetKey, err)
}
if expiredTime := time.Since(expiresAt(cert)); expiredTime >= gracePeriod {
logger.Info("certificate expired beyond grace period; cleaning up",
zap.String("asset_key", assetKey),
zap.Duration("expired_for", expiredTime),
zap.Duration("grace_period", gracePeriod))
baseName := strings.TrimSuffix(assetKey, ".crt")
for _, relatedAsset := range []string{
assetKey,
baseName + ".key",
baseName + ".json",
} {
logger.Info("deleting asset because resource expired", zap.String("asset_key", relatedAsset))
err := storage.Delete(ctx, relatedAsset)
if err != nil {
logger.Error("could not clean up asset related to expired certificate",
zap.String("base_name", baseName),
zap.String("related_asset", relatedAsset),
zap.Error(err))
}
}
}
}
// update listing; if folder is empty, delete it
siteAssets, err = storage.List(ctx, siteKey, false)
if err != nil {
continue
}
if len(siteAssets) == 0 {
logger.Info("deleting site folder because key is empty", zap.String("site_key", siteKey))
err := storage.Delete(ctx, siteKey)
if err != nil {
return fmt.Errorf("deleting empty site folder %s: %v", siteKey, err)
}
}
}
}
return nil
}
// forceRenew forcefully renews cert and replaces it in the cache, and returns the new certificate. It is intended
// for use primarily in the case of cert revocation. This MUST NOT be called within a lock on cfg.certCacheMu.
func (cfg *Config) forceRenew(ctx context.Context, logger *zap.Logger, cert Certificate) (Certificate, error) {
if cert.ocsp != nil && cert.ocsp.Status == ocsp.Revoked {
logger.Warn("OCSP status for managed certificate is REVOKED; attempting to replace with new certificate",
zap.Strings("identifiers", cert.Names),
zap.Time("expiration", expiresAt(cert.Leaf)))
} else {
logger.Warn("forcefully renewing certificate",
zap.Strings("identifiers", cert.Names),
zap.Time("expiration", expiresAt(cert.Leaf)))
}
renewName := cert.Names[0]
// if revoked for key compromise, we can't be sure whether the storage of
// the key is still safe; however, we KNOW the old key is not safe, and we
// can only hope by the time of revocation that storage has been secured;
// key management is not something we want to get into, but in this case
// it seems prudent to replace the key - and since renewal requires reuse
// of a prior key, we can't do a "renew" to replace the cert if we need a
// new key, so we'll have to do an obtain instead
var obtainInsteadOfRenew bool
if cert.ocsp != nil && cert.ocsp.RevocationReason == acme.ReasonKeyCompromise {
err := cfg.moveCompromisedPrivateKey(ctx, cert, logger)
if err != nil {
logger.Error("could not remove compromised private key from use",
zap.Strings("identifiers", cert.Names),
zap.String("issuer", cert.issuerKey),
zap.Error(err))
}
obtainInsteadOfRenew = true
}
var err error
if obtainInsteadOfRenew {
err = cfg.ObtainCertAsync(ctx, renewName)
} else {
// notice that we force renewal; otherwise, it might see that the
// certificate isn't close to expiring and return, but we really
// need a replacement certificate! see issue #4191
err = cfg.RenewCertAsync(ctx, renewName, true)
}
if err != nil {
if cert.ocsp != nil && cert.ocsp.Status == ocsp.Revoked {
// probably better to not serve a revoked certificate at all
logger.Error("unable to obtain new to certificate after OCSP status of REVOKED; removing from cache",
zap.Strings("identifiers", cert.Names),
zap.Error(err))
cfg.certCache.mu.Lock()
cfg.certCache.removeCertificate(cert)
cfg.certCache.mu.Unlock()
}
return cert, fmt.Errorf("unable to forcefully get new certificate for %v: %w", cert.Names, err)
}
return cfg.reloadManagedCertificate(ctx, cert)
}
// moveCompromisedPrivateKey moves the private key for cert to a ".compromised" file
// by copying the data to the new file, then deleting the old one.
func (cfg *Config) moveCompromisedPrivateKey(ctx context.Context, cert Certificate, logger *zap.Logger) error {
privKeyStorageKey := StorageKeys.SitePrivateKey(cert.issuerKey, cert.Names[0])
privKeyPEM, err := cfg.Storage.Load(ctx, privKeyStorageKey)
if err != nil {
return err
}
compromisedPrivKeyStorageKey := privKeyStorageKey + ".compromised"
err = cfg.Storage.Store(ctx, compromisedPrivKeyStorageKey, privKeyPEM)
if err != nil {
// better safe than sorry: as a last resort, try deleting the key so it won't be reused
cfg.Storage.Delete(ctx, privKeyStorageKey)
return err
}
err = cfg.Storage.Delete(ctx, privKeyStorageKey)
if err != nil {
return err
}
logger.Info("removed certificate's compromised private key from use",
zap.String("storage_path", compromisedPrivKeyStorageKey),
zap.Strings("identifiers", cert.Names),
zap.String("issuer", cert.issuerKey))
return nil
}
// certShouldBeForceRenewed returns true if cert should be forcefully renewed
// (like if it is revoked according to its OCSP response).
func certShouldBeForceRenewed(cert Certificate) bool {
return cert.managed &&
len(cert.Names) > 0 &&
cert.ocsp != nil &&
cert.ocsp.Status == ocsp.Revoked
}
type certList []Certificate
// insert appends cert to the list if it is not already in the list.
// Efficiency: O(n)
func (certs *certList) insert(cert Certificate) {
for _, c := range *certs {
if c.hash == cert.hash {
return
}
}
*certs = append(*certs, cert)
}
const (
// DefaultRenewCheckInterval is how often to check certificates for expiration.
// Scans are very lightweight, so this can be semi-frequent. This default should
// be smaller than <Minimum Cert Lifetime>*DefaultRenewalWindowRatio/3, which
// gives certificates plenty of chance to be renewed on time.
DefaultRenewCheckInterval = 10 * time.Minute
// DefaultRenewalWindowRatio is how much of a certificate's lifetime becomes the
// renewal window. The renewal window is the span of time at the end of the
// certificate's validity period in which it should be renewed. A default value
// of ~1/3 is pretty safe and recommended for most certificates.
DefaultRenewalWindowRatio = 1.0 / 3.0
// DefaultOCSPCheckInterval is how often to check if OCSP stapling needs updating.
DefaultOCSPCheckInterval = 1 * time.Hour
)