diff --git a/cmd/task/main.go b/cmd/task/main.go index 10352039..4cd7d8ce 100644 --- a/cmd/task/main.go +++ b/cmd/task/main.go @@ -2133,6 +2133,76 @@ Examples: } rootCmd.AddCommand(upgradeCmd) + // Doctor command - diagnose the agent server's GitHub auth health. + doctorCmd := &cobra.Command{ + Use: "doctor", + Short: "Diagnose agent server health (GitHub auth & rate limits)", + Long: `Checks the local GitHub CLI authentication used by agents and warns about +conditions that cause shared GraphQL bucket exhaustion across agent servers: + + - gh not installed or not logged in (GitHub operations silently fail) + - an expired/revoked token (401 Bad credentials) + - authentication as a PERSONAL account, whose 5,000 pt/hr GraphQL limit is + shared per-user across every server authed as that account + - low remaining GraphQL headroom + +Each agent server should authenticate with its OWN GitHub App installation +token (a bot identity), which gets an independent GraphQL bucket. + +Exits non-zero on hard errors (gh missing, logged out, expired token). Pass +--strict to also exit non-zero on warnings (e.g. personal-account auth), so a +fleet sweep like 'for s in ...; do ssh $s ty doctor --strict; done' can flag +servers programmatically.`, + Run: func(cmd *cobra.Command, args []string) { + strict, _ := cmd.Flags().GetBool("strict") + fmt.Println(boldStyle.Render("TaskYou Doctor")) + fmt.Println(dimStyle.Render("Checking GitHub authentication...")) + fmt.Println() + + status := github.CheckAuth(context.Background()) + if status.Err != nil { + fmt.Println(warnStyle.Render("⚠ Could not fully probe gh: " + status.Err.Error())) + fmt.Println() + } + + findings := status.Findings() + hasError := false + for _, f := range findings { + var icon, msg string + switch f.Severity { + case github.SeverityOK: + icon = successStyle.Render("✓") + msg = f.Message + case github.SeverityWarn: + icon = warnStyle.Render("⚠") + msg = warnStyle.Render(f.Message) + case github.SeverityError: + icon = errorStyle.Render("✗") + msg = errorStyle.Render(f.Message) + hasError = true + } + fmt.Printf("%s %s\n", icon, msg) + if f.Detail != "" { + fmt.Println(dimStyle.Render(" " + f.Detail)) + } + } + + fmt.Println() + if hasError || status.HasProblems() { + fmt.Println(dimStyle.Render("Tip: provision this server with its own GitHub App installation token,")) + fmt.Println(dimStyle.Render("mirroring the offerlab-devs[bot] pattern, for an independent rate-limit bucket.")) + } else { + fmt.Println(successStyle.Render("All checks passed.")) + } + + if hasError || (strict && status.HasProblems()) { + os.Exit(1) + } + }, + } + doctorCmd.Flags().Bool("strict", false, "Exit non-zero on warnings too (e.g. personal-account auth), for fleet health sweeps") + rootCmd.AddCommand(doctorCmd) + // Settings command settingsCmd := &cobra.Command{ Use: "settings", diff --git a/internal/executor/executor.go b/internal/executor/executor.go index 93baa606..d1177f1b 100644 --- a/internal/executor/executor.go +++ b/internal/executor/executor.go @@ -1398,6 +1398,20 @@ Work on this task until completion. When you're done or need input: - If you see a path like .task-worktrees/, you're in the right place - The parent repo does NOT exist for you - only this worktree does +🐙 GITHUB CLI (gh) — CONSERVE THE SHARED GRAPHQL BUCKET: + GitHub's GraphQL rate limit (5,000 points/hr) is PER-USER and is shared by + every agent server authenticated as the same account. It exhausts easily. + + - Prefer REST for PR reads — it has a SEPARATE 5,000/hr bucket: + gh pr view --json ... ❌ (GraphQL-backed) + gh api repos/{owner}/{repo}/pulls/{n} ✅ (REST) + - NEVER busy-poll CI with "gh pr checks" in a loop. Instead use: + gh run watch ✅ (blocks server-side, no polling) + or poll REST check-runs with backoff: + gh api repos/{owner}/{repo}/commits/{sha}/check-runs + - If you see "GraphQL bucket is exhausted", switch to the REST equivalents + above and back off — do not retry the GraphQL call in a tight loop. + The task system will automatically detect your status. ═══════════════════════════════════════════════════════════════` } diff --git a/internal/executor/executor_test.go b/internal/executor/executor_test.go index cc904e0c..9d8bdfcb 100644 --- a/internal/executor/executor_test.go +++ b/internal/executor/executor_test.go @@ -1933,3 +1933,21 @@ func TestCleanupWorktreeNonWorktreeTask(t *testing.T) { t.Error("expected settings.local.json to be removed") } } + +func TestBuildSystemInstructions_GitHubGuidance(t *testing.T) { + instructions := (&Executor{}).buildSystemInstructions() + + // The GitHub guidance must steer agents away from the shared GraphQL bucket. + wants := []string{ + "GITHUB CLI", + "PER-USER", + "gh pr checks", + "gh run watch", + "REST", + } + for _, want := range wants { + if !strings.Contains(instructions, want) { + t.Errorf("buildSystemInstructions() missing %q", want) + } + } +} diff --git a/internal/github/auth.go b/internal/github/auth.go new file mode 100644 index 00000000..1987b333 --- /dev/null +++ b/internal/github/auth.go @@ -0,0 +1,294 @@ +package github + +import ( + "context" + "encoding/json" + "fmt" + "os/exec" + "strings" + "time" +) + +// AccountType classifies the identity that `gh` is authenticated as. +type AccountType string + +const ( + // AccountTypeUnknown means we could not determine the identity. + AccountTypeUnknown AccountType = "unknown" + // AccountTypePersonal is a regular GitHub user account (e.g. via `gh auth login`). + // Personal accounts share a single 5,000 pt/hr GraphQL bucket PER-USER across + // every machine authenticated as that user — the root cause of bucket exhaustion. + AccountTypePersonal AccountType = "personal" + // AccountTypeApp is a GitHub App / bot identity (login ends in "[bot]", + // API type "Bot"). Each App installation token gets its OWN GraphQL bucket, + // so it does not contend with other servers. + AccountTypeApp AccountType = "app" +) + +// Severity ranks a diagnostic finding. +type Severity string + +const ( + SeverityOK Severity = "ok" + SeverityWarn Severity = "warn" + SeverityError Severity = "error" +) + +// Finding is a single diagnostic result from the GitHub auth check. +type Finding struct { + Severity Severity + Message string + Detail string // optional remediation hint +} + +// AuthStatus is the result of inspecting the local `gh` authentication. +type AuthStatus struct { + GHInstalled bool + LoggedIn bool + TokenValid bool // false when the token is expired/revoked (401 Bad credentials) + + Account string + AccountType AccountType + + // GraphQL rate-limit headroom. Remaining/Limit are -1 when unknown. + GraphQLRemaining int + GraphQLLimit int + GraphQLResetAt time.Time + + // Err holds an unexpected error encountered while probing (not auth state). + Err error +} + +// ghUserResponse is the subset of `gh api user` we care about. +type ghUserResponse struct { + Login string `json:"login"` + Type string `json:"type"` +} + +// rateLimitResponse mirrors `gh api rate_limit`. +type rateLimitResponse struct { + Resources struct { + GraphQL struct { + Limit int `json:"limit"` + Remaining int `json:"remaining"` + Reset int64 `json:"reset"` + } `json:"graphql"` + } `json:"resources"` +} + +// CheckAuth inspects the local `gh` CLI authentication and rate-limit headroom. +// It never returns an error for ordinary auth states (not installed, logged out, +// expired token); those are reported via the returned AuthStatus fields. +func CheckAuth(ctx context.Context) AuthStatus { + status := AuthStatus{ + GraphQLRemaining: -1, + GraphQLLimit: -1, + } + + if _, err := exec.LookPath("gh"); err != nil { + return status + } + status.GHInstalled = true + + // Probe the authenticated identity via the REST user endpoint. This both + // confirms the token is valid and tells us whether it's a personal account + // or a bot/App identity. + userCtx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + cmd := exec.CommandContext(userCtx, "gh", "api", "user", "--jq", "{login: .login, type: .type}") + out, err := cmd.Output() + if err != nil { + stderr := "" + if exitErr, ok := err.(*exec.ExitError); ok { + stderr = string(exitErr.Stderr) + } + st := classifyUserErr(stderr) + status.LoggedIn = st.loggedIn + status.TokenValid = st.tokenValid + if st.unknown { + // Unknown failure (network, timeout). Surface it but don't crash. + status.Err = fmt.Errorf("gh api user failed: %w", err) + } + return status + } + + status.LoggedIn = true + status.TokenValid = true + + var user ghUserResponse + if jsonErr := json.Unmarshal(out, &user); jsonErr == nil { + status.Account = user.Login + status.AccountType = classifyAccount(user.Login, user.Type) + } + + // Best-effort GraphQL headroom probe. Failures leave the -1 sentinels. + rl := fetchRateLimit(ctx) + if rl != nil { + status.GraphQLRemaining = rl.Resources.GraphQL.Remaining + status.GraphQLLimit = rl.Resources.GraphQL.Limit + if rl.Resources.GraphQL.Reset > 0 { + status.GraphQLResetAt = time.Unix(rl.Resources.GraphQL.Reset, 0) + } + } + + return status +} + +// fetchRateLimit queries the GraphQL rate-limit bucket. Returns nil on failure. +func fetchRateLimit(ctx context.Context) *rateLimitResponse { + rlCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + out, err := exec.CommandContext(rlCtx, "gh", "api", "rate_limit").Output() + if err != nil { + return nil + } + var rl rateLimitResponse + if err := json.Unmarshal(out, &rl); err != nil { + return nil + } + return &rl +} + +// classifyAccount decides whether an identity is a personal user or a bot/App. +// GitHub App installation tokens authenticate as a Bot user whose login ends in +// "[bot]" and whose API type is "Bot". +func classifyAccount(login, apiType string) AccountType { + if strings.EqualFold(apiType, "Bot") || strings.HasSuffix(login, "[bot]") { + return AccountTypeApp + } + if login != "" { + return AccountTypePersonal + } + return AccountTypeUnknown +} + +// userErrState is the auth state inferred from a FAILED `gh api user` call. +// +// stderr pattern → state +// ───────────────────────────────────────────── +// "Bad credentials"/401 → loggedIn, !tokenValid (token expired/revoked) +// "not logged in"/… → !loggedIn (no auth configured) +// anything else → unknown (network/timeout — caller sets Err) +type userErrState struct { + loggedIn bool + tokenValid bool + unknown bool // stderr matched neither known pattern +} + +// classifyUserErr maps `gh api user` stderr to an auth state. Pure and table-tested +// so a future change to gh's wording is caught rather than silently mis-routed. +func classifyUserErr(stderr string) userErrState { + switch { + case isBadCredentials(stderr): + // Config present (logged in) but the token is expired/revoked. + return userErrState{loggedIn: true, tokenValid: false} + case isNotLoggedIn(stderr): + return userErrState{loggedIn: false} + default: + return userErrState{unknown: true} + } +} + +func isBadCredentials(stderr string) bool { + s := strings.ToLower(stderr) + return strings.Contains(s, "bad credentials") || + strings.Contains(s, "401") || + strings.Contains(s, "requires authentication") +} + +func isNotLoggedIn(stderr string) bool { + s := strings.ToLower(stderr) + return strings.Contains(s, "not logged") || + strings.Contains(s, "no accounts") || + strings.Contains(s, "authentication required") || + strings.Contains(s, "to get started with github cli") || + strings.Contains(s, "gh auth login") +} + +// graphQLLowThreshold is the remaining-points level below which `ty doctor` warns +// the OPERATOR that the bucket is close to exhaustion. +// +// This is intentionally higher than rateLimitThreshold (200) in pr.go, which gates +// automatic batch PR fetches. The two serve different jobs and should not be merged: +// - rateLimitThreshold (200): the TUI's own self-throttle — stop spending budget. +// - graphQLLowThreshold (500): warn a human earlier, before automatic throttling +// kicks in, so they can act (e.g. re-provision a bot token) with headroom to spare. +const graphQLLowThreshold = 500 + +// Findings translates the auth status into ordered diagnostic findings. The most +// severe issues come first. This is the data `ty doctor` renders. +func (s AuthStatus) Findings() []Finding { + var findings []Finding + + if !s.GHInstalled { + return []Finding{{ + Severity: SeverityError, + Message: "GitHub CLI (gh) is not installed", + Detail: "Install gh so agents can interact with GitHub: https://cli.github.com", + }} + } + + if !s.LoggedIn { + return []Finding{{ + Severity: SeverityError, + Message: "gh is not logged in to any GitHub account", + Detail: "GitHub operations (gh pr ...) will silently fail. Authenticate this server with its own GitHub App installation token.", + }} + } + + if !s.TokenValid { + return []Finding{{ + Severity: SeverityError, + Message: "gh token is expired or revoked (401 Bad credentials)", + Detail: "GitHub operations are silently failing. Re-provision this server's GitHub App installation token.", + }} + } + + switch s.AccountType { + case AccountTypeApp: + findings = append(findings, Finding{ + Severity: SeverityOK, + Message: fmt.Sprintf("Authenticated as GitHub App / bot identity %q", s.Account), + Detail: "Bot identities get their own GraphQL bucket — no cross-server contention.", + }) + case AccountTypePersonal: + findings = append(findings, Finding{ + Severity: SeverityWarn, + Message: fmt.Sprintf("Authenticated as personal account %q", s.Account), + Detail: "GitHub's GraphQL limit is PER-USER. Every server authenticated as this same account shares ONE 5,000 pt/hr bucket and will exhaust it under load. Provision this server with its own GitHub App installation token (bot identity) instead.", + }) + default: + findings = append(findings, Finding{ + Severity: SeverityWarn, + Message: "Could not determine the GitHub account type", + Detail: "Run `gh api user` to inspect the authenticated identity.", + }) + } + + // GraphQL headroom. + if s.GraphQLRemaining >= 0 && s.GraphQLLimit > 0 { + msg := fmt.Sprintf("GraphQL bucket: %d/%d points remaining", s.GraphQLRemaining, s.GraphQLLimit) + if !s.GraphQLResetAt.IsZero() { + msg += fmt.Sprintf(" (resets %s)", s.GraphQLResetAt.Format(time.Kitchen)) + } + sev := SeverityOK + detail := "" + if s.GraphQLRemaining < graphQLLowThreshold { + sev = SeverityWarn + detail = "Bucket is nearly exhausted. Prefer REST for PR reads and avoid `gh pr checks` polling loops until it resets." + } + findings = append(findings, Finding{Severity: sev, Message: msg, Detail: detail}) + } + + return findings +} + +// HasProblems reports whether any finding is a warning or error. +func (s AuthStatus) HasProblems() bool { + for _, f := range s.Findings() { + if f.Severity != SeverityOK { + return true + } + } + return false +} diff --git a/internal/github/auth_test.go b/internal/github/auth_test.go new file mode 100644 index 00000000..1e975c56 --- /dev/null +++ b/internal/github/auth_test.go @@ -0,0 +1,184 @@ +package github + +import ( + "testing" + "time" +) + +func TestClassifyAccount(t *testing.T) { + tests := []struct { + name string + login string + apiType string + want AccountType + }{ + {"bot by type", "offerlab-agents[bot]", "Bot", AccountTypeApp}, + {"bot by login suffix only", "ci-runner[bot]", "", AccountTypeApp}, + {"bot type case-insensitive", "x", "bot", AccountTypeApp}, + {"personal user", "bborn", "User", AccountTypePersonal}, + {"personal no type", "someone", "", AccountTypePersonal}, + {"unknown empty", "", "", AccountTypeUnknown}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := classifyAccount(tt.login, tt.apiType); got != tt.want { + t.Errorf("classifyAccount(%q, %q) = %q, want %q", tt.login, tt.apiType, got, tt.want) + } + }) + } +} + +func TestIsBadCredentials(t *testing.T) { + yes := []string{ + "HTTP 401: Bad credentials (https://api.github.com/user)", + "gh: Bad credentials", + "This endpoint requires authentication", + "error: 401", + } + for _, s := range yes { + if !isBadCredentials(s) { + t.Errorf("isBadCredentials(%q) = false, want true", s) + } + } + no := []string{"", "some network error", "rate limit exceeded"} + for _, s := range no { + if isBadCredentials(s) { + t.Errorf("isBadCredentials(%q) = true, want false", s) + } + } +} + +func TestIsNotLoggedIn(t *testing.T) { + yes := []string{ + "You are not logged into any GitHub hosts. To get started with GitHub CLI, please run: gh auth login", + "no accounts found", + "authentication required", + } + for _, s := range yes { + if !isNotLoggedIn(s) { + t.Errorf("isNotLoggedIn(%q) = false, want true", s) + } + } + if isNotLoggedIn("Bad credentials") { + // Bad credentials means logged in but expired, not logged out. + // (isBadCredentials handles that case separately.) + } +} + +func TestClassifyUserErr(t *testing.T) { + tests := []struct { + name string + stderr string + wantLoggedIn bool + wantTokenValid bool + wantUnknown bool + }{ + {"expired token", "HTTP 401: Bad credentials", true, false, false}, + {"requires auth", "This endpoint requires authentication", true, false, false}, + {"logged out", "You are not logged into any GitHub hosts. Run: gh auth login", false, false, false}, + {"no accounts", "no accounts found", false, false, false}, + {"network error is unknown", "dial tcp: i/o timeout", false, false, true}, + {"empty is unknown", "", false, false, true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := classifyUserErr(tt.stderr) + if got.loggedIn != tt.wantLoggedIn || got.tokenValid != tt.wantTokenValid || got.unknown != tt.wantUnknown { + t.Errorf("classifyUserErr(%q) = %+v, want loggedIn=%v tokenValid=%v unknown=%v", + tt.stderr, got, tt.wantLoggedIn, tt.wantTokenValid, tt.wantUnknown) + } + }) + } +} + +func TestFindings_NotInstalled(t *testing.T) { + s := AuthStatus{GHInstalled: false} + findings := s.Findings() + if len(findings) != 1 || findings[0].Severity != SeverityError { + t.Fatalf("expected single error finding, got %+v", findings) + } + if !s.HasProblems() { + t.Error("HasProblems() = false, want true") + } +} + +func TestFindings_LoggedOut(t *testing.T) { + s := AuthStatus{GHInstalled: true, LoggedIn: false} + findings := s.Findings() + if len(findings) != 1 || findings[0].Severity != SeverityError { + t.Fatalf("expected single error finding, got %+v", findings) + } +} + +func TestFindings_ExpiredToken(t *testing.T) { + s := AuthStatus{GHInstalled: true, LoggedIn: true, TokenValid: false} + findings := s.Findings() + if len(findings) != 1 || findings[0].Severity != SeverityError { + t.Fatalf("expected single error finding for expired token, got %+v", findings) + } +} + +func TestFindings_PersonalAccountWarns(t *testing.T) { + s := AuthStatus{ + GHInstalled: true, + LoggedIn: true, + TokenValid: true, + Account: "bborn", + AccountType: AccountTypePersonal, + GraphQLRemaining: 4800, + GraphQLLimit: 5000, + } + findings := s.Findings() + if len(findings) < 1 { + t.Fatal("expected at least one finding") + } + if findings[0].Severity != SeverityWarn { + t.Errorf("personal account finding severity = %q, want warn", findings[0].Severity) + } + if !s.HasProblems() { + t.Error("HasProblems() = false, want true for personal account") + } +} + +func TestFindings_AppAccountHealthy(t *testing.T) { + s := AuthStatus{ + GHInstalled: true, + LoggedIn: true, + TokenValid: true, + Account: "offerlab-agents[bot]", + AccountType: AccountTypeApp, + GraphQLRemaining: 5000, + GraphQLLimit: 5000, + GraphQLResetAt: time.Now().Add(time.Hour), + } + findings := s.Findings() + for _, f := range findings { + if f.Severity != SeverityOK { + t.Errorf("expected all OK findings for healthy bot, got %q: %s", f.Severity, f.Message) + } + } + if s.HasProblems() { + t.Error("HasProblems() = true, want false for healthy bot identity") + } +} + +func TestFindings_LowGraphQLHeadroomWarns(t *testing.T) { + s := AuthStatus{ + GHInstalled: true, + LoggedIn: true, + TokenValid: true, + Account: "offerlab-agents[bot]", + AccountType: AccountTypeApp, + GraphQLRemaining: 100, // below threshold + GraphQLLimit: 5000, + } + var sawWarn bool + for _, f := range s.Findings() { + if f.Severity == SeverityWarn { + sawWarn = true + } + } + if !sawWarn { + t.Error("expected a warning for low GraphQL headroom") + } +} diff --git a/internal/github/pr.go b/internal/github/pr.go index 3a301b0f..00aa17ff 100644 --- a/internal/github/pr.go +++ b/internal/github/pr.go @@ -338,6 +338,8 @@ func graphQLRateLimitRemaining() int { } // rateLimitThreshold is the minimum remaining GraphQL calls before we skip batch fetches. +// This is the TUI's automatic self-throttle. `ty doctor` warns the operator earlier, +// at the higher graphQLLowThreshold (500) in auth.go — see that constant for the rationale. const rateLimitThreshold = 200 // FetchAllPRsForRepo fetches all open and recently merged PRs for a repo in a single API call.