forked from syncthing/syncthing
/
monitor.go
581 lines (496 loc) · 15.1 KB
/
monitor.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
// Copyright (C) 2014 The Syncthing Authors.
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this file,
// You can obtain one at https://mozilla.org/MPL/2.0/.
package main
import (
"bufio"
"context"
"fmt"
"io"
"os"
"os/exec"
"os/signal"
"path/filepath"
"runtime"
"strings"
"syscall"
"time"
"github.com/syncthing/syncthing/lib/events"
"github.com/syncthing/syncthing/lib/fs"
"github.com/syncthing/syncthing/lib/locations"
"github.com/syncthing/syncthing/lib/osutil"
"github.com/syncthing/syncthing/lib/protocol"
"github.com/syncthing/syncthing/lib/sync"
"github.com/syncthing/syncthing/lib/syncthing"
)
var (
stdoutFirstLines []string // The first 10 lines of stdout
stdoutLastLines []string // The last 50 lines of stdout
stdoutMut = sync.NewMutex()
)
const (
countRestarts = 4
loopThreshold = 60 * time.Second
logFileAutoCloseDelay = 5 * time.Second
logFileMaxOpenTime = time.Minute
panicUploadMaxWait = 30 * time.Second
panicUploadNoticeWait = 10 * time.Second
)
func monitorMain(runtimeOptions RuntimeOptions) {
l.SetPrefix("[monitor] ")
var dst io.Writer = os.Stdout
logFile := runtimeOptions.logFile
if logFile != "-" {
if expanded, err := fs.ExpandTilde(logFile); err == nil {
logFile = expanded
}
var fileDst io.Writer
if runtimeOptions.logMaxSize > 0 {
open := func(name string) (io.WriteCloser, error) {
return newAutoclosedFile(name, logFileAutoCloseDelay, logFileMaxOpenTime), nil
}
fileDst = newRotatedFile(logFile, open, int64(runtimeOptions.logMaxSize), runtimeOptions.logMaxFiles)
} else {
fileDst = newAutoclosedFile(logFile, logFileAutoCloseDelay, logFileMaxOpenTime)
}
if runtime.GOOS == "windows" {
// Translate line breaks to Windows standard
fileDst = osutil.ReplacingWriter{
Writer: fileDst,
From: '\n',
To: []byte{'\r', '\n'},
}
}
// Log to both stdout and file.
dst = io.MultiWriter(dst, fileDst)
l.Infof(`Log output saved to file "%s"`, logFile)
}
args := os.Args
var restarts [countRestarts]time.Time
stopSign := make(chan os.Signal, 1)
signal.Notify(stopSign, os.Interrupt, sigTerm)
restartSign := make(chan os.Signal, 1)
sigHup := syscall.Signal(1)
signal.Notify(restartSign, sigHup)
childEnv := childEnv()
first := true
for {
maybeReportPanics()
if t := time.Since(restarts[0]); t < loopThreshold {
l.Warnf("%d restarts in %v; not retrying further", countRestarts, t)
os.Exit(syncthing.ExitError.AsInt())
}
copy(restarts[0:], restarts[1:])
restarts[len(restarts)-1] = time.Now()
cmd := exec.Command(args[0], args[1:]...)
cmd.Env = childEnv
stderr, err := cmd.StderrPipe()
if err != nil {
panic(err)
}
stdout, err := cmd.StdoutPipe()
if err != nil {
panic(err)
}
l.Debugln("Starting syncthing")
err = cmd.Start()
if err != nil {
l.Warnln("Error starting the main Syncthing process:", err)
panic("Error starting the main Syncthing process")
}
stdoutMut.Lock()
stdoutFirstLines = make([]string, 0, 10)
stdoutLastLines = make([]string, 0, 50)
stdoutMut.Unlock()
wg := sync.NewWaitGroup()
wg.Add(1)
go func() {
copyStderr(stderr, dst)
wg.Done()
}()
wg.Add(1)
go func() {
copyStdout(stdout, dst)
wg.Done()
}()
exit := make(chan error)
go func() {
wg.Wait()
exit <- cmd.Wait()
}()
stopped := false
select {
case s := <-stopSign:
l.Infof("Signal %d received; exiting", s)
cmd.Process.Signal(sigTerm)
err = <-exit
stopped = true
case s := <-restartSign:
l.Infof("Signal %d received; restarting", s)
cmd.Process.Signal(sigHup)
err = <-exit
case err = <-exit:
}
if err == nil {
// Successful exit indicates an intentional shutdown
os.Exit(syncthing.ExitSuccess.AsInt())
}
if exiterr, ok := err.(*exec.ExitError); ok {
exitCode := exiterr.ExitCode()
if stopped || runtimeOptions.noRestart {
os.Exit(exitCode)
}
if exitCode == syncthing.ExitUpgrade.AsInt() {
// Restart the monitor process to release the .old
// binary as part of the upgrade process.
l.Infoln("Restarting monitor...")
if err = restartMonitor(args); err != nil {
l.Warnln("Restart:", err)
}
os.Exit(exitCode)
}
}
if runtimeOptions.noRestart {
os.Exit(syncthing.ExitError.AsInt())
}
l.Infoln("Syncthing exited:", err)
time.Sleep(1 * time.Second)
if first {
// Let the next child process know that this is not the first time
// it's starting up.
childEnv = append(childEnv, "STRESTART=yes")
first = false
}
}
}
func copyStderr(stderr io.Reader, dst io.Writer) {
br := bufio.NewReader(stderr)
var panicFd *os.File
defer func() {
if panicFd != nil {
_ = panicFd.Close()
maybeReportPanics()
}
}()
for {
line, err := br.ReadString('\n')
if err != nil {
return
}
if panicFd == nil {
dst.Write([]byte(line))
if strings.Contains(line, "SIGILL") {
l.Warnln(`
*******************************************************************************
* Crash due to illegal instruction detected. This is most likely due to a CPU *
* incompatibility with the high performance hashing package. Switching to the *
* standard hashing package instead. Please report this issue at: *
* *
* https://github.com/syncthing/syncthing/issues *
* *
* Include the details of your CPU. *
*******************************************************************************
`)
os.Setenv("STHASHING", "standard")
return
}
if strings.HasPrefix(line, "panic:") || strings.HasPrefix(line, "fatal error:") {
panicFd, err = os.Create(locations.GetTimestamped(locations.PanicLog))
if err != nil {
l.Warnln("Create panic log:", err)
continue
}
l.Warnf("Panic detected, writing to \"%s\"", panicFd.Name())
if strings.Contains(line, "leveldb") && strings.Contains(line, "corrupt") {
l.Warnln(`
*********************************************************************************
* Crash due to corrupt database. *
* *
* This crash usually occurs due to one of the following reasons: *
* - Syncthing being stopped abruptly (killed/loss of power) *
* - Bad hardware (memory/disk issues) *
* - Software that affects disk writes (SSD caching software and simillar) *
* *
* Please see the following URL for instructions on how to recover: *
* https://docs.syncthing.net/users/faq.html#my-syncthing-database-is-corrupt *
*********************************************************************************
`)
} else {
l.Warnln("Please check for existing issues with similar panic message at https://github.com/syncthing/syncthing/issues/")
l.Warnln("If no issue with similar panic message exists, please create a new issue with the panic log attached")
}
stdoutMut.Lock()
for _, line := range stdoutFirstLines {
panicFd.WriteString(line)
}
panicFd.WriteString("...\n")
for _, line := range stdoutLastLines {
panicFd.WriteString(line)
}
stdoutMut.Unlock()
}
panicFd.WriteString("Panic at " + time.Now().Format(time.RFC3339) + "\n")
}
if panicFd != nil {
panicFd.WriteString(line)
}
}
}
func copyStdout(stdout io.Reader, dst io.Writer) {
br := bufio.NewReader(stdout)
for {
line, err := br.ReadString('\n')
if err != nil {
return
}
stdoutMut.Lock()
if len(stdoutFirstLines) < cap(stdoutFirstLines) {
stdoutFirstLines = append(stdoutFirstLines, line)
} else {
if l := len(stdoutLastLines); l == cap(stdoutLastLines) {
stdoutLastLines = stdoutLastLines[:l-1]
}
stdoutLastLines = append(stdoutLastLines, line)
}
stdoutMut.Unlock()
dst.Write([]byte(line))
}
}
func restartMonitor(args []string) error {
if runtime.GOOS != "windows" {
// syscall.Exec is the cleanest way to restart on Unixes as it
// replaces the current process with the new one, keeping the pid and
// controlling terminal and so on
return restartMonitorUnix(args)
}
// but it isn't supported on Windows, so there we start a normal
// exec.Command and return.
return restartMonitorWindows(args)
}
func restartMonitorUnix(args []string) error {
if !strings.ContainsRune(args[0], os.PathSeparator) {
// The path to the binary doesn't contain a slash, so it should be
// found in $PATH.
binary, err := exec.LookPath(args[0])
if err != nil {
return err
}
args[0] = binary
}
return syscall.Exec(args[0], args, os.Environ())
}
func restartMonitorWindows(args []string) error {
cmd := exec.Command(args[0], args[1:]...)
// Retain the standard streams
cmd.Stderr = os.Stderr
cmd.Stdout = os.Stdout
cmd.Stdin = os.Stdin
return cmd.Start()
}
// rotatedFile keeps a set of rotating logs. There will be the base file plus up
// to maxFiles rotated ones, each ~ maxSize bytes large.
type rotatedFile struct {
name string
create createFn
maxSize int64 // bytes
maxFiles int
currentFile io.WriteCloser
currentSize int64
}
// the createFn should act equivalently to os.Create
type createFn func(name string) (io.WriteCloser, error)
func newRotatedFile(name string, create createFn, maxSize int64, maxFiles int) *rotatedFile {
return &rotatedFile{
name: name,
create: create,
maxSize: maxSize,
maxFiles: maxFiles,
}
}
func (r *rotatedFile) Write(bs []byte) (int, error) {
// Check if we're about to exceed the max size, and if so close this
// file so we'll start on a new one.
if r.currentSize+int64(len(bs)) > r.maxSize {
r.currentFile.Close()
r.currentFile = nil
r.currentSize = 0
}
// If we have no current log, rotate old files out of the way and create
// a new one.
if r.currentFile == nil {
r.rotate()
fd, err := r.create(r.name)
if err != nil {
return 0, err
}
r.currentFile = fd
}
n, err := r.currentFile.Write(bs)
r.currentSize += int64(n)
return n, err
}
func (r *rotatedFile) rotate() {
// The files are named "name", "name.0", "name.1", ...
// "name.(r.maxFiles-1)". Increase the numbers on the
// suffixed ones.
for i := r.maxFiles - 1; i > 0; i-- {
from := numberedFile(r.name, i-1)
to := numberedFile(r.name, i)
err := os.Rename(from, to)
if err != nil && !os.IsNotExist(err) {
fmt.Println("LOG: Rotating logs:", err)
}
}
// Rename the base to base.0
err := os.Rename(r.name, numberedFile(r.name, 0))
if err != nil && !os.IsNotExist(err) {
fmt.Println("LOG: Rotating logs:", err)
}
}
// numberedFile adds the number between the file name and the extension.
func numberedFile(name string, num int) string {
ext := filepath.Ext(name) // contains the dot
withoutExt := name[:len(name)-len(ext)]
return fmt.Sprintf("%s.%d%s", withoutExt, num, ext)
}
// An autoclosedFile is an io.WriteCloser that opens itself for appending on
// Write() and closes itself after an interval of no writes (closeDelay) or
// when the file has been open for too long (maxOpenTime). A call to Write()
// will return any error that happens on the resulting Open() call too. Errors
// on automatic Close() calls are silently swallowed...
type autoclosedFile struct {
name string // path to write to
closeDelay time.Duration // close after this long inactivity
maxOpenTime time.Duration // or this long after opening
fd io.WriteCloser // underlying WriteCloser
opened time.Time // timestamp when the file was last opened
closed chan struct{} // closed on Close(), stops the closerLoop
closeTimer *time.Timer // fires closeDelay after a write
mut sync.Mutex
}
func newAutoclosedFile(name string, closeDelay, maxOpenTime time.Duration) *autoclosedFile {
f := &autoclosedFile{
name: name,
closeDelay: closeDelay,
maxOpenTime: maxOpenTime,
mut: sync.NewMutex(),
closed: make(chan struct{}),
closeTimer: time.NewTimer(time.Minute),
}
go f.closerLoop()
return f
}
func (f *autoclosedFile) Write(bs []byte) (int, error) {
f.mut.Lock()
defer f.mut.Unlock()
// Make sure the file is open for appending
if err := f.ensureOpen(); err != nil {
return 0, err
}
// If we haven't run into the maxOpenTime, postpone close for another
// closeDelay
if time.Since(f.opened) < f.maxOpenTime {
f.closeTimer.Reset(f.closeDelay)
}
return f.fd.Write(bs)
}
func (f *autoclosedFile) Close() error {
f.mut.Lock()
defer f.mut.Unlock()
// Stop the timer and closerLoop() routine
f.closeTimer.Stop()
close(f.closed)
// Close the file, if it's open
if f.fd != nil {
return f.fd.Close()
}
return nil
}
// Must be called with f.mut held!
func (f *autoclosedFile) ensureOpen() error {
if f.fd != nil {
// File is already open
return nil
}
// We open the file for write only, and create it if it doesn't exist.
flags := os.O_WRONLY | os.O_CREATE
if f.opened.IsZero() {
// This is the first time we are opening the file. We should truncate
// it to better emulate an os.Create() call.
flags |= os.O_TRUNC
} else {
// The file was already opened once, so we should append to it.
flags |= os.O_APPEND
}
fd, err := os.OpenFile(f.name, flags, 0644)
if err != nil {
return err
}
f.fd = fd
f.opened = time.Now()
return nil
}
func (f *autoclosedFile) closerLoop() {
for {
select {
case <-f.closeTimer.C:
// Close the file when the timer expires.
f.mut.Lock()
if f.fd != nil {
f.fd.Close() // errors, schmerrors
f.fd = nil
}
f.mut.Unlock()
case <-f.closed:
return
}
}
}
// Returns the desired child environment, properly filtered and added to.
func childEnv() []string {
var env []string
for _, str := range os.Environ() {
if strings.HasPrefix(str, "STNORESTART=") {
continue
}
if strings.HasPrefix(str, "STMONITORED=") {
continue
}
env = append(env, str)
}
env = append(env, "STMONITORED=yes")
return env
}
// maybeReportPanics tries to figure out if crash reporting is on or off,
// and reports any panics it can find if it's enabled. We spend at most
// panicUploadMaxWait uploading panics...
func maybeReportPanics() {
// Try to get a config to see if/where panics should be reported.
cfg, err := loadOrDefaultConfig(protocol.EmptyDeviceID, events.NoopLogger)
if err != nil {
l.Warnln("Couldn't load config; not reporting crash")
return
}
// Bail if we're not supposed to report panics.
opts := cfg.Options()
if !opts.CREnabled {
return
}
// Set up a timeout on the whole operation.
ctx, cancel := context.WithTimeout(context.Background(), panicUploadMaxWait)
defer cancel()
// Print a notice if the upload takes a long time.
go func() {
select {
case <-ctx.Done():
return
case <-time.After(panicUploadNoticeWait):
l.Warnln("Uploading crash reports is taking a while, please wait...")
}
}()
// Report the panics.
dir := locations.GetBaseDir(locations.ConfigBaseDir)
uploadPanicLogs(ctx, opts.CRURL, dir)
}