apoxy-dev
diff --git a/pkg/tunnel/client.go b/pkg/tunnel/client.go
Lines changed: 2 additions & 0 deletions
diff --git a/pkg/tunnel/metrics/metrics.go b/pkg/tunnel/metrics/metrics.go
Lines changed: 45 additions & 0 deletions
diff --git a/pkg/tunnel/metrics/process_id_test.go b/pkg/tunnel/metrics/process_id_test.go
Lines changed: 86 additions & 0 deletions
diff --git a/pkg/tunnel/metrics/reexport.go b/pkg/tunnel/metrics/reexport.go
Lines changed: 73 additions & 29 deletions
@@ -30,6 +30,7 @@ import (
 	alog "github.com/apoxy-dev/apoxy/pkg/log"
 	"github.com/apoxy-dev/apoxy/pkg/tunnel/bfdl"
 	tunnelconn "github.com/apoxy-dev/apoxy/pkg/tunnel/connection"
+	"github.com/apoxy-dev/apoxy/pkg/tunnel/metrics"
 	"github.com/apoxy-dev/apoxy/pkg/tunnel/router"
 )
 
@@ -292,6 +293,7 @@ func (d *TunnelDialer) Dial(
 	for k, v := range options.labels {
 		q.Add("label."+k, v)
 	}
+	q.Add(metrics.QueryParamAgentProcessID, metrics.AgentProcessID())
 	addrUrl.RawQuery = q.Encode()
 
 	tmpl, err := uritemplate.New(addrUrl.String())
 
@@ -1,9 +1,12 @@
 package metrics
 
 import (
+	"os"
+	"regexp"
 	"sync"
 	"time"
 
+	"github.com/google/uuid"
 	"github.com/prometheus/client_golang/prometheus"
 	"sigs.k8s.io/controller-runtime/pkg/metrics"
 
@@ -13,6 +16,48 @@ import (
 // startTime is the time the process started. Used for uptime calculation.
 var startTime = time.Now()
 
+// QueryParamAgentProcessID is the CONNECT-IP query-string key the agent uses
+// to tell the server its stable per-process ID. Referenced by both the client
+// and the server handler so a rename can't silently break the wire.
+const QueryParamAgentProcessID = "agent_process_id"
+
+// processID is stable for the process lifetime so callers can distinguish
+// "same process with multiple conns" from "multiple processes each with one
+// conn". Prefers a CRI container ID (cross-refs kubelet/containerd metadata)
+// and falls back to a UUID when none is detectable.
+var processID = initProcessID()
+
+// containerIDRegex matches the 64-char hex token that CRI runtimes
+// (containerd, cri-o, docker, podman) embed in cgroup paths. Covers both
+// cgroup v1 and v2 layouts and the common systemd-slice wrappers
+// (`cri-containerd-<id>.scope`, `docker-<id>.scope`, etc).
+var containerIDRegex = regexp.MustCompile(`[0-9a-f]{64}`)
+
+// initProcessID computes the value stored in processID: the container ID
+// found in /proc/self/cgroup when there is one, otherwise a freshly generated
+// UUID string.
+func initProcessID() string {
+	// Linux-only: on macOS/Windows the read fails and we fall back to a UUID.
+	// The same fallback applies when the file is readable but holds no 64-hex
+	// token. Both paths rotate on container/process restart, so cardinality is
+	// bounded by the same restart rate either way.
+	if id := detectContainerID("/proc/self/cgroup"); id != "" {
+		return id
+	}
+	return uuid.NewString()
+}
+
+// detectContainerID reads the cgroup file at path and returns the container
+// ID parsed from it. It returns "" both when the file cannot be read and when
+// it contains no match; the read error is not surfaced to the caller.
+func detectContainerID(path string) string {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return ""
+	}
+	return parseCgroupForContainerID(data)
+}
+
+// parseCgroupForContainerID returns the leftmost match of containerIDRegex in
+// data, or "" when there is none.
+//
+// NOTE(review): the pattern is unanchored and only the first match is used, so
+// a cgroup path containing several 64-hex segments (e.g. nested runtimes)
+// yields the leftmost one — confirm that is the ID callers want.
+func parseCgroupForContainerID(data []byte) string {
+	return containerIDRegex.FindString(string(data))
+}
+
+// AgentProcessID returns the stable per-process ID for this agent. The value
+// is the package-level processID, which is assigned once from initProcessID
+// when the package is initialized, so every call returns the same string.
+func AgentProcessID() string { return processID }
+
 var (
 	// Agent info and lifecycle metrics.
 
 
@@ -0,0 +1,86 @@
+package metrics
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestParseCgroupForContainerID is a table-driven test of
+// parseCgroupForContainerID: each case feeds one cgroup-file payload and
+// checks the extracted ID (or "" when no 64-hex token is present).
+func TestParseCgroupForContainerID(t *testing.T) {
+	// Synthetic 64-hex IDs; each case splices one into a representative cgroup path.
+	const k8sCri = "3bf3c5a2e4d8f9c0a1b2c3d4e5f6789012345678abcdef0123456789abcdef01"
+	const docker = "abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789"
+	const criO = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"
+
+	tests := []struct {
+		name string
+		data string
+		want string
+	}{
+		{
+			name: "cgroup v1 k8s containerd",
+			data: "12:memory:/kubepods.slice/kubepods-burstable.slice/" +
+				"kubepods-burstable-pod1234abcd.slice/cri-containerd-" + k8sCri + ".scope\n",
+			want: k8sCri,
+		},
+		{
+			name: "cgroup v2 unified containerd",
+			data: "0::/kubepods.slice/kubepods-burstable.slice/" +
+				"kubepods-burstable-pod1234abcd.slice/cri-containerd-" + k8sCri + ".scope\n",
+			want: k8sCri,
+		},
+		{
+			name: "cgroup v1 docker",
+			data: "11:memory:/docker/" + docker + "\n",
+			want: docker,
+		},
+		{
+			name: "cgroup v1 cri-o",
+			data: "1:name=systemd:/kubepods/burstable/pod<uid>/crio-" + criO + ".scope\n",
+			want: criO,
+		},
+		{
+			name: "multiple lines picks the first 64-hex token",
+			data: "12:cpuset:/\n" +
+				"11:memory:/docker/" + docker + "\n" +
+				"0::/docker/" + docker + "\n",
+			want: docker,
+		},
+		{
+			name: "no container id (bare-metal / tests)",
+			data: "0::/user.slice/user-1000.slice/session-1.scope\n",
+			want: "",
+		},
+		{
+			name: "empty file",
+			data: "",
+			want: "",
+		},
+		{
+			name: "hex shorter than 64 is not matched (avoids partial shas)",
+			data: "0::/system.slice/some-service-abc123def.scope\n",
+			want: "",
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := parseCgroupForContainerID([]byte(tt.data))
+			assert.Equal(t, tt.want, got)
+		})
+	}
+}
+
+// TestDetectContainerID_MissingFile checks that detectContainerID returns ""
+// for a path that cannot be read.
+func TestDetectContainerID_MissingFile(t *testing.T) {
+	// Non-existent path — e.g., macOS dev or sandboxed environments.
+	got := detectContainerID("/proc/does-not-exist/cgroup")
+	assert.Empty(t, got, "missing file must return empty, not panic")
+}
+
+// TestAgentProcessID_Stable checks that AgentProcessID is non-empty and that
+// two consecutive calls return the same value.
+func TestAgentProcessID_Stable(t *testing.T) {
+	// Whatever initProcessID() chose at package init, AgentProcessID() must
+	// return the same value on every call within the process lifetime.
+	a := AgentProcessID()
+	b := AgentProcessID()
+	assert.NotEmpty(t, a)
+	assert.Equal(t, a, b, "AgentProcessID must be stable across calls")
+}
@@ -7,26 +7,58 @@ import (
 
 	"github.com/prometheus/client_golang/prometheus"
 	dto "github.com/prometheus/client_model/go"
-	"google.golang.org/protobuf/proto"
 )
 
 const (
 	// StaleResultTimeout is how long a pushed result remains valid.
 	// Results older than this are skipped during collection.
 	StaleResultTimeout = 60 * time.Second
 
-	labelTunnelNode = "tunnel_node"
-	labelAgent      = "agent"
-	labelProjectID  = "project_id"
+	labelTunnelNode     = "tunnel_node"
+	labelAgent          = "agent" // Deprecated alias for conn_id; retained for dashboard compatibility.
+	labelConnID         = "conn_id"
+	labelAgentProcessID = "agent_process_id"
+	labelProjectID      = "project_id"
+
+	// connUptimeMetric is the first-party per-connection uptime metric emitted
+	// by the ReexportCollector (computed from StoreResult.RegisteredAt). Unlike
+	// tunnel_agent_uptime_seconds — which is re-exported from the agent and
+	// reports the agent *process* uptime duplicated across every conn_id — this
+	// metric reflects the lifetime of a single CONNECT-IP session.
+	connUptimeMetric = "tunnel_connection_uptime_seconds"
 )
 
+// targetLabelNames is the canonical order of the labels we inject on every
+// metric emitted by this collector (both re-exported agent metrics and the
+// first-party conn uptime gauge). Single source of truth so the connUptimeDesc
+// and per-result label-value slice stay in lock-step.
+var targetLabelNames = []string{
+	labelTunnelNode,
+	labelAgent,
+	labelConnID,
+	labelAgentProcessID,
+	labelProjectID,
+}
+
+// targetLabelValues returns the label values for t in exactly the order of
+// targetLabelNames, so the two slices pair up positionally. Any reordering
+// here must be mirrored in targetLabelNames.
+func targetLabelValues(t StoreTarget) []string {
+	return []string{
+		t.TunnelNode,
+		t.AgentName, // legacy "agent" value; today always equal to ConnID on the server side
+		t.ConnID,
+		t.AgentProcessID,
+		t.ProjectID,
+	}
+}
+
 // ReexportCollector implements prometheus.Collector by iterating over pushed
-// agent metrics and re-emitting them with tunnel_node, agent, and project_id
-// labels injected. It should be registered with the tunnelproxy's Prometheus
-// registry so agent metrics appear on the tunnelproxy's /metrics endpoint.
+// agent metrics and re-emitting them with tunnel_node, agent, conn_id,
+// agent_process_id, and project_id labels injected. It should be registered
+// with the tunnelproxy's Prometheus registry so agent metrics appear on the
+// tunnelproxy's /metrics endpoint.
 type ReexportCollector struct {
-	store  *MetricsStore
-	prefix string
+	store          *MetricsStore
+	prefix         string
+	// connUptimeDesc describes the per-connection uptime gauge; it is built in
+	// NewReexportCollector from prefix+connUptimeMetric and targetLabelNames.
+	connUptimeDesc *prometheus.Desc
 }
 
 // ReexportOption configures a ReexportCollector.
@@ -47,6 +79,12 @@ func NewReexportCollector(store *MetricsStore, opts ...ReexportOption) *Reexport
 	for _, o := range opts {
 		o(c)
 	}
+	c.connUptimeDesc = prometheus.NewDesc(
+		c.prefix+connUptimeMetric,
+		"Seconds since this tunnel connection was registered with the tunnelproxy.",
+		targetLabelNames,
+		nil,
+	)
 	return c
 }
 
@@ -60,25 +98,34 @@ func (c *ReexportCollector) Describe(ch chan<- *prometheus.Desc) {}
 func (c *ReexportCollector) Collect(ch chan<- prometheus.Metric) {
 	now := time.Now()
 	c.store.ForEachResult(func(connID string, result *StoreResult) bool {
+		// Computed once per result and shared by the uptime gauge and every
+		// re-exported metric so both carry identical target labels.
+		values := targetLabelValues(result.Target)
+		// Guard tolerates tests that populate store.results directly, bypassing
+		// Register. In production Register always stamps RegisteredAt.
+		if !result.RegisteredAt.IsZero() {
+			ch <- prometheus.MustNewConstMetric(
+				c.connUptimeDesc,
+				prometheus.GaugeValue,
+				now.Sub(result.RegisteredAt).Seconds(),
+				values...,
+			)
+		}
+		// NOTE(review): the uptime gauge above is emitted before this staleness
+		// check, so a result older than StaleResultTimeout still reports
+		// connection uptime while its re-exported agent metrics are skipped —
+		// confirm this is intended.
 		if now.Sub(result.PushedAt) > StaleResultTimeout {
 			return true
 		}
-		c.collectResult(ch, result)
+		c.collectResult(ch, result, values)
 		return true
 	})
 }
 
-func (c *ReexportCollector) collectResult(ch chan<- prometheus.Metric, result *StoreResult) {
-	extraLabels := []*dto.LabelPair{
-		{Name: proto.String(labelTunnelNode), Value: proto.String(result.Target.TunnelNode)},
-		{Name: proto.String(labelAgent), Value: proto.String(result.Target.AgentName)},
-		{Name: proto.String(labelProjectID), Value: proto.String(result.Target.ProjectID)},
-	}
-
+func (c *ReexportCollector) collectResult(
+	ch chan<- prometheus.Metric,
+	result *StoreResult,
+	targetValues []string,
+) {
 	for name, family := range result.Families {
 		prefixedName := c.prefix + name
 		for _, m := range family.Metric {
-			pm, err := c.toPrometheusMetric(prefixedName, family.GetType(), m, extraLabels)
+			pm, err := c.toPrometheusMetric(prefixedName, family.GetType(), m, targetValues)
 			if err != nil {
 				slog.Debug("Skipping metric",
 					slog.String("name", prefixedName),
@@ -95,20 +142,17 @@ func (c *ReexportCollector) toPrometheusMetric(
 	name string,
 	mtype dto.MetricType,
 	m *dto.Metric,
-	extraLabels []*dto.LabelPair,
+	targetValues []string,
 ) (prometheus.Metric, error) {
-	// Copy labels to avoid mutating the protobuf message's backing array.
 	existing := m.GetLabel()
-	allLabels := make([]*dto.LabelPair, 0, len(existing)+len(extraLabels))
-	allLabels = append(allLabels, existing...)
-	allLabels = append(allLabels, extraLabels...)
-
-	labelNames := make([]string, len(allLabels))
-	labelValues := make([]string, len(allLabels))
-	for i, lp := range allLabels {
-		labelNames[i] = lp.GetName()
-		labelValues[i] = lp.GetValue()
+	labelNames := make([]string, 0, len(existing)+len(targetLabelNames))
+	labelValues := make([]string, 0, len(existing)+len(targetLabelNames))
+	for _, lp := range existing {
+		labelNames = append(labelNames, lp.GetName())
+		labelValues = append(labelValues, lp.GetValue())
 	}
+	labelNames = append(labelNames, targetLabelNames...)
+	labelValues = append(labelValues, targetValues...)
 
 	desc := prometheus.NewDesc(name, "Re-exported agent metric.", labelNames, nil)
Original file line number	Diff line number	Diff line change
`@@ -30,6 +30,7 @@ import (`
`30`	`30`	`alog "github.com/apoxy-dev/apoxy/pkg/log"`
`31`	`31`	`"github.com/apoxy-dev/apoxy/pkg/tunnel/bfdl"`
`32`	`32`	`tunnelconn "github.com/apoxy-dev/apoxy/pkg/tunnel/connection"`
	`33`	`+ "github.com/apoxy-dev/apoxy/pkg/tunnel/metrics"`
`33`	`34`	`"github.com/apoxy-dev/apoxy/pkg/tunnel/router"`
`34`	`35`	`)`
`35`	`36`
`@@ -292,6 +293,7 @@ func (d *TunnelDialer) Dial(`
`292`	`293`	`for k, v := range options.labels {`
`293`	`294`	`q.Add("label."+k, v)`
`294`	`295`	`}`
	`296`	`+ q.Add(metrics.QueryParamAgentProcessID, metrics.AgentProcessID())`
`295`	`297`	`addrUrl.RawQuery = q.Encode()`
`296`	`298`
`297`	`299`	`tmpl, err := uritemplate.New(addrUrl.String())`