Skip to content

Commit 849f400

Browse files
dilyevsky and claude committed
[tunnel] fix race between agent registration and endpoint address allocation
The connect handler was calling onConnect (which allocates the infra Endpoint) before adding the agent to TunnelNode.Status.Agents. This caused the InfraEndpointReconciler to miss the agent when writing the overlay address, leaving the connection permanently stuck with no address.

Fixes:
- Reorder the connect handler to register the agent before endpoint allocation
- Fix the upsertAgentStatus range-loop copy bug (it assigned to the loop variable's copy, not to the slice element)
- Requeue ReconcileWithClient when an agent address is pending

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent be67ec6 commit 849f400

File tree

1 file changed

+20
-9
lines changed

1 file changed

+20
-9
lines changed

pkg/tunnel/server.go

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -374,9 +374,9 @@ func (t *TunnelServer) Start(ctx context.Context) error {
374374
}
375375

376376
func upsertAgentStatus(s *corev1alpha.TunnelNodeStatus, agent *corev1alpha.AgentStatus) {
377-
for _, a := range s.Agents {
378-
if a.Name == agent.Name {
379-
a = *agent
377+
for i := range s.Agents {
378+
if s.Agents[i].Name == agent.Name {
379+
s.Agents[i] = *agent
380380
return
381381
}
382382
}
@@ -530,11 +530,9 @@ func (t *TunnelServer) makeSingleConnectHandler(ctx context.Context, qConn quic.
530530

531531
t.conns.Set(connID, conn)
532532

533-
// Invoke onConnect callback if configured.
534-
if t.options.onConnect != nil {
535-
t.options.onConnect(ctx, connID, tn)
536-
}
537-
533+
// Register the agent in TunnelNode status before allocating the
534+
// endpoint so the InfraEndpointReconciler can find the agent when
535+
// it writes the overlay address.
538536
logger.Info("Updating agent status")
539537

540538
agent := &corev1alpha.AgentStatus{
@@ -562,6 +560,13 @@ func (t *TunnelServer) makeSingleConnectHandler(ctx context.Context, qConn quic.
562560
logger.Error("Failed to update agent status", slog.Any("error", err))
563561
}
564562

563+
// Invoke onConnect callback if configured.
564+
// This triggers endpoint allocation; agent must be in TunnelNode
565+
// status first so the address reconciler can find it.
566+
if t.options.onConnect != nil {
567+
t.options.onConnect(ctx, connID, tn)
568+
}
569+
565570
// Blocking wait for the lifetime of the tunnel connection.
566571
select {
567572
case <-r.Context().Done():
@@ -763,6 +768,7 @@ func (t *TunnelServer) ReconcileWithClient(ctx context.Context, c client.Client,
763768
}
764769

765770
// Configure agent addresses from TunnelNode status.
771+
var pendingAddress bool
766772
for _, agent := range node.Status.Agents {
767773
log := log.WithValues("agent", agent.Name)
768774

@@ -774,7 +780,8 @@ func (t *TunnelServer) ReconcileWithClient(ctx context.Context, c client.Client,
774780

775781
// Parse IPv6 address from agent status.
776782
if agent.AgentAddress == "" {
777-
log.Info("Agent address is empty")
783+
log.Info("Agent address is empty, will requeue")
784+
pendingAddress = true
778785
continue
779786
}
780787
addrv6, err := netip.ParseAddr(agent.AgentAddress)
@@ -799,5 +806,9 @@ func (t *TunnelServer) ReconcileWithClient(ctx context.Context, c client.Client,
799806
}
800807
}
801808

809+
if pendingAddress {
810+
return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
811+
}
812+
802813
return ctrl.Result{}, nil
803814
}

0 commit comments

Comments
 (0)