48 changes: 40 additions & 8 deletions internal/gitclone/manager.go
@@ -320,6 +320,17 @@ func WithReadLockReturn[T any](repo *Repository, fn func() (T, error)) (T, error)
return fn()
}

// ResetToEmpty transitions the repository back to StateEmpty so that a
// subsequent call to TryStartCloning can re-attempt the clone. Use this
// when a restored snapshot turns out to be corrupt or empty and needs to
// be replaced with a fresh clone.
func (r *Repository) ResetToEmpty() {
r.mu.Lock()
defer r.mu.Unlock()
r.state = StateEmpty
r.lastFetch = time.Time{}
}

// TryStartCloning atomically transitions the repository from StateEmpty to
// StateCloning. Returns true if this goroutine won the transition and should
// proceed with the clone/restore; false if another goroutine already claimed it.
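
A minimal sketch of the retry flow these two methods enable (annotation, not part of the diff; restoreOrClone is a hypothetical stand-in for the caller's restore-or-clone logic):

    if repo.TryStartCloning() {
        if err := restoreOrClone(ctx, repo); err != nil {
            // Nothing servable was produced: go back to StateEmpty so a
            // later caller can win TryStartCloning and re-attempt.
            repo.ResetToEmpty()
        }
    }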
@@ -466,17 +477,17 @@ func configureMirror(ctx context.Context, repoPath string, packThreads int) error
return nil
}

// cloneTimeout bounds `git clone --mirror` so a stuck clone cannot block
// CloneTimeout bounds `git clone --mirror` so a stuck clone cannot block
// the repo indefinitely. This is deliberately generous: large repos may
// take 10-20 minutes for GitHub to compute the server-side pack.
const cloneTimeout = 30 * time.Minute
const CloneTimeout = 30 * time.Minute

func (r *Repository) executeClone(ctx context.Context) error {
if err := os.MkdirAll(filepath.Dir(r.path), 0o750); err != nil {
return errors.Wrap(err, "create clone directory")
}

cloneCtx, cancel := context.WithTimeout(ctx, cloneTimeout)
cloneCtx, cancel := context.WithTimeout(ctx, CloneTimeout)
defer cancel()

config := DefaultGitTuningConfig()
@@ -525,6 +536,19 @@ func (r *Repository) Fetch(ctx context.Context) error {
// for catch-up fetches after snapshot restore where the delta may be large and
// the default fetchTimeout is too short.
func (r *Repository) FetchWithTimeout(ctx context.Context, timeout time.Duration) error {
return r.fetchInternal(ctx, timeout, true)
}

// FetchLenient fetches from upstream with the given timeout but without the
// low-speed transfer check. Use this for post-restore catch-up fetches where
// the delta may be very large and GitHub's server-side pack computation can
// stall at near-zero transfer rate for minutes — the same situation that
// executeClone handles by omitting lowSpeedLimit.
func (r *Repository) FetchLenient(ctx context.Context, timeout time.Duration) error {
return r.fetchInternal(ctx, timeout, false)
}

func (r *Repository) fetchInternal(ctx context.Context, timeout time.Duration, enforceSpeedLimit bool) error {
select {
case <-r.fetchSem:
defer func() {
@@ -547,12 +571,20 @@ func (r *Repository) FetchWithTimeout(ctx context.Context, timeout time.Duration) error {

config := DefaultGitTuningConfig()

args := []string{
"-C", r.path,
"-c", "http.postBuffer=" + strconv.Itoa(config.PostBuffer),
}
if enforceSpeedLimit {
args = append(args,
"-c", "http.lowSpeedLimit="+strconv.Itoa(config.LowSpeedLimit),
"-c", "http.lowSpeedTime="+strconv.Itoa(int(config.LowSpeedTime.Seconds())),
)
}
args = append(args, "fetch", "--prune", "--prune-tags")

// #nosec G204 - r.path is controlled by us
cmd, err := r.gitCommand(fetchCtx, "-C", r.path,
"-c", "http.postBuffer="+strconv.Itoa(config.PostBuffer),
"-c", "http.lowSpeedLimit="+strconv.Itoa(config.LowSpeedLimit),
"-c", "http.lowSpeedTime="+strconv.Itoa(int(config.LowSpeedTime.Seconds())),
"fetch", "--prune", "--prune-tags")
cmd, err := r.gitCommand(fetchCtx, args...)
if err != nil {
return errors.Wrap(err, "create git command")
}
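
For orientation, a sketch of how the two public wrappers are meant to be chosen (annotation, not part of the diff; the condition name and the two-minute figure are illustrative, not from this PR):

    var err error
    if justRestoredSnapshot { // illustrative flag
        // Large delta expected and the server may sit near 0 B/s while
        // computing the pack, so run without the low-speed watchdog.
        err = repo.FetchLenient(ctx, gitclone.CloneTimeout)
    } else {
        // Routine refresh: keep http.lowSpeedLimit/lowSpeedTime enforced
        // so a genuinely stalled transfer is killed early.
        err = repo.FetchWithTimeout(ctx, 2*time.Minute)
    }

In the lenient case the command built above reduces to roughly: git -C <path> -c http.postBuffer=<n> fetch --prune --prune-tags, with the two low-speed -c flags omitted.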
61 changes: 36 additions & 25 deletions internal/strategy/git/git.go
@@ -133,13 +133,13 @@ func New(
}

start := time.Now()
if err := repo.Fetch(ctx); err != nil {
if err := repo.FetchLenient(ctx, gitclone.CloneTimeout); err != nil {
logger.ErrorContext(ctx, "Startup fetch failed for existing repo", "upstream", repo.UpstreamURL(), "error", err,
"duration", time.Since(start))
} else {
logger.InfoContext(ctx, "Startup fetch completed for existing repo", "upstream", repo.UpstreamURL(),
"duration", time.Since(start))
continue
}
logger.InfoContext(ctx, "Startup fetch completed for existing repo", "upstream", repo.UpstreamURL(),
"duration", time.Since(start))

postRefs, err := repo.GetLocalRefs(ctx)
if err != nil {
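
Read together with the interleaved old/new lines above, the startup path for an existing repo now appears to be (a reconstruction from the rendered diff, abbreviated to this hunk):

    start := time.Now()
    if err := repo.FetchLenient(ctx, gitclone.CloneTimeout); err != nil {
        logger.ErrorContext(ctx, "Startup fetch failed for existing repo",
            "upstream", repo.UpstreamURL(), "error", err, "duration", time.Since(start))
        continue // skip the ref comparison below; local refs may be stale
    }
    logger.InfoContext(ctx, "Startup fetch completed for existing repo",
        "upstream", repo.UpstreamURL(), "duration", time.Since(start))

    postRefs, err := repo.GetLocalRefs(ctx)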
@@ -459,31 +459,42 @@ func (s *Strategy) startClone(ctx context.Context, repo *gitclone.Repository) {
if err := s.tryRestoreSnapshot(ctx, repo); err != nil {
logger.InfoContext(ctx, "Mirror snapshot restore failed, falling back to clone", "upstream", upstream, "error", err)
} else {
// Mirror snapshot restored successfully. The bare mirror is immediately
// servable — mark ready and let background fetch handle freshening.
repo.MarkReady()

if err := s.cleanupSpools(upstream); err != nil {
logger.WarnContext(ctx, "Failed to clean up spools", "upstream", upstream, "error", err)
}
logger.InfoContext(ctx, "Mirror snapshot restored, fetching to freshen", "upstream", upstream)

// Fetch with a generous timeout and no low-speed check: mirror
// snapshots can be hours old, so the delta may be very large and
// GitHub's server-side pack computation can stall at near-zero
// transfer for minutes (same as initial clone).
//
// State remains StateCloning until fetch succeeds so that
// concurrent requests (via ensureCloneReady) block rather than
// serving from a potentially empty or stale mirror.
if err := repo.FetchLenient(ctx, gitclone.CloneTimeout); err != nil {
logger.WarnContext(ctx, "Post-restore fetch failed, discarding snapshot and falling back to clone",
"upstream", upstream, "error", err)
// The restored snapshot may be corrupt or empty. Remove it and
// fall through to a fresh clone so we don't re-upload bad data.
repo.ResetToEmpty()
if rmErr := os.RemoveAll(repo.Path()); rmErr != nil {
logger.WarnContext(ctx, "Failed to remove corrupt mirror", "upstream", upstream, "error", rmErr)
}
} else {
repo.MarkReady()

logger.InfoContext(ctx, "Mirror snapshot restored, serving immediately", "upstream", upstream)
if err := s.cleanupSpools(upstream); err != nil {
logger.WarnContext(ctx, "Failed to clean up spools", "upstream", upstream, "error", err)
}

// Fetch synchronously so the mirror is fresh before we serve from it.
// Mirror snapshots can be hours old; serving stale data defeats the
// purpose of the cache. Call repo.Fetch directly instead of
// backgroundFetch, which would skip because MarkReady sets lastFetch.
if err := repo.Fetch(ctx); err != nil {
logger.WarnContext(ctx, "Post-restore fetch failed, serving from snapshot", "upstream", upstream, "error", err)
}
logger.InfoContext(ctx, "Post-restore fetch completed, serving", "upstream", upstream)

if s.config.SnapshotInterval > 0 {
s.scheduleSnapshotJobs(repo)
}
if s.config.RepackInterval > 0 {
s.scheduleRepackJobs(repo)
if s.config.SnapshotInterval > 0 {
s.scheduleSnapshotJobs(repo)
}
if s.config.RepackInterval > 0 {
s.scheduleRepackJobs(repo)
}
return
}
return
}

logger.InfoContext(ctx, "Starting clone", "upstream", upstream, "path", repo.Path())