Skip to content

Commit

Permalink
fix: this fixes a critical bug where multiple stat collectors were co…
Browse files Browse the repository at this point in the history
…mpeting and never stopping. see #2838 (#2839)
  • Loading branch information
amir20 committed Mar 21, 2024
1 parent 83ef59c commit b72ee29
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 24 deletions.
1 change: 0 additions & 1 deletion internal/docker/client.go
Expand Up @@ -224,7 +224,6 @@ func (d *_client) ContainerStats(ctx context.Context, id string, stats chan<- Co
return err
}

log.Debugf("starting to stream stats for: %s", id)
defer response.Body.Close()
decoder := json.NewDecoder(response.Body)
var v *types.StatsJSON
Expand Down
2 changes: 1 addition & 1 deletion internal/docker/container_store.go
Expand Up @@ -89,7 +89,7 @@ func (s *ContainerStore) init(ctx context.Context) {
for {
select {
case event := <-events:
log.Debugf("received event: %+v", event)
log.Tracef("received event: %+v", event)
switch event.Name {
case "start":
if container, err := s.client.FindContainer(event.ActorID); err == nil {
Expand Down
46 changes: 24 additions & 22 deletions internal/docker/stats_collector.go
Expand Up @@ -62,34 +62,44 @@ func (c *StatsCollector) Stop() {
func (c *StatsCollector) reset() {
c.mu.Lock()
defer c.mu.Unlock()
log.Debug("resetting timer for container stats collector")
if c.timer != nil {
c.timer.Stop()
}
c.timer = nil
}

func streamStats(parent context.Context, sc *StatsCollector, id string) {
ctx, cancel := context.WithCancel(parent)
sc.cancelers.Store(id, cancel)
log.Debugf("starting to stream stats for: %s", id)
if err := sc.client.ContainerStats(ctx, id, sc.stream); err != nil {
log.Debugf("stopping to stream stats for: %s", id)
if !errors.Is(err, context.Canceled) && !errors.Is(err, io.EOF) {
log.Errorf("unexpected error when streaming container stats: %v", err)
}
}
}

// Start starts the stats collector and blocks until it's stopped. It returns true if the collector was stopped, false if it was already running
func (sc *StatsCollector) Start(ctx context.Context) bool {
func (sc *StatsCollector) Start(parentCtx context.Context) bool {
sc.reset()
if sc.totalStarted.Add(1) > 1 {
sc.totalStarted.Add(1)

var ctx context.Context

sc.mu.Lock()
if sc.stopper != nil {
sc.mu.Unlock()
return false
}
sc.mu.Lock()
ctx, sc.stopper = context.WithCancel(ctx)
ctx, sc.stopper = context.WithCancel(parentCtx)
sc.mu.Unlock()

if containers, err := sc.client.ListContainers(); err == nil {
for _, c := range containers {
if c.State == "running" {
go func(client Client, id string) {
ctx, cancel := context.WithCancel(ctx)
sc.cancelers.Store(id, cancel)
if err := client.ContainerStats(ctx, id, sc.stream); err != nil {
if !errors.Is(err, context.Canceled) && !errors.Is(err, io.EOF) {
log.Errorf("unexpected error when streaming container stats: %v", err)
}
}
}(sc.client, c.ID)
go streamStats(ctx, sc, c.ID)
}
}
} else {
Expand All @@ -102,15 +112,7 @@ func (sc *StatsCollector) Start(ctx context.Context) bool {
for event := range events {
switch event.Name {
case "start":
go func(client Client, id string) {
ctx, cancel := context.WithCancel(ctx)
sc.cancelers.Store(id, cancel)
if err := client.ContainerStats(ctx, id, sc.stream); err != nil {
if !errors.Is(err, context.Canceled) && !errors.Is(err, io.EOF) {
log.Errorf("unexpected error when streaming container stats: %v", err)
}
}
}(sc.client, event.ActorID)
go streamStats(ctx, sc, event.ActorID)

case "die":
if cancel, ok := sc.cancelers.LoadAndDelete(event.ActorID); ok {
Expand Down

0 comments on commit b72ee29

Please sign in to comment.