# Checkpoint Lister

This notebook lists and saves available revisions/checkpoints
(branches) from one or more Hugging Face repositories.

In [None]:
import re
from pathlib import Path
from huggingface_hub import list_repo_refs

# --- Parameters (edit these) ---
outdir = Path(".") / "checkpoint_lists" # dir where .txt files will be saved
INCLUDE_MAIN = True # include the 'main' branch
MOVE_MAIN_TO_END = True # move 'main' to the end of the list

_STEP_RE = re.compile(r"(?:^|[^0-9])step\s*([0-9]+)", re.IGNORECASE)

def _step_key(name="step"):
    """
    Extract a numeric step for sorting if present, else return a sentinel.
    Examples matched: 'step1000', 'step-2000', 'foo-step300'.
    """
    m = _STEP_RE.search(name)
    if m:
        try:
            return (int(m.group(1)), name)
        except ValueError:
            pass
    # Put non-step names after step-suffixed ones, but keep deterministic order by name
    return (10**12, name)

def sort_branches(branches, move_main_to_end):
    """Sort branches by 'step' numeric value if available, otherwise by name."""
    br = list(branches)
    br.sort(key=_step_key)
    if move_main_to_end and "main" in br:
        br = [b for b in br if b != "main"] + ["main"]
    return br

def fetch_repo_branches(repo_name, include_main=True, move_main_to_end=True):
    refs = list_repo_refs(repo_name)
    branches = [b.name for b in getattr(refs, "branches", [])]

    if not include_main:
        branches = [b for b in branches if b != "main"]
    ordered = sort_branches(branches=branches, move_main_to_end=move_main_to_end)
    return ordered

def default_filename(repo_name):
    safe = repo_name.replace("/", "_")
    return f"checkpoints_{safe}.txt"

def save_branches(branches, out_path):
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        for b in branches:
            f.write(b + "\n")
    print(f"Saved {len(branches)} checkpoints to {out_path}")


  from .autonotebook import tqdm as notebook_tqdm


### Pythia

In [2]:
repo_name = "EleutherAI/pythia-1b"
out = list_repo_refs(repo_name)
branches = fetch_repo_branches(
    repo_name=repo_name, 
    include_main=INCLUDE_MAIN, 
    move_main_to_end=MOVE_MAIN_TO_END)
save_branches(
    branches=branches, 
    out_path=outdir / default_filename(repo_name))

Saved 155 checkpoints to checkpoint_lists/checkpoints_EleutherAI_pythia-1b.txt


In [3]:
repo_name = "EleutherAI/pythia-6.9b"
out = list_repo_refs(repo_name)
branches = fetch_repo_branches(
    repo_name=repo_name, 
    include_main=INCLUDE_MAIN, 
    move_main_to_end=MOVE_MAIN_TO_END)
save_branches(
    branches=branches, 
    out_path=outdir / default_filename(repo_name))

Saved 155 checkpoints to checkpoint_lists/checkpoints_EleutherAI_pythia-6.9b.txt


### OLMo

In [4]:
repo_name = "allenai/OLMo-1B-hf"
branches = fetch_repo_branches(
    repo_name=repo_name, 
    include_main=INCLUDE_MAIN, 
    move_main_to_end=MOVE_MAIN_TO_END)
save_branches(
    branches=branches, 
    out_path=outdir / default_filename(repo_name))

Saved 352 checkpoints to checkpoint_lists/checkpoints_allenai_OLMo-1B-hf.txt


In [5]:
repo_name = "allenai/OLMo-1B-0724-hf"
branches = fetch_repo_branches(
    repo_name=repo_name, 
    include_main=INCLUDE_MAIN, 
    move_main_to_end=MOVE_MAIN_TO_END)
save_branches(
    branches=branches, 
    out_path=outdir / default_filename(repo_name))

Saved 1447 checkpoints to checkpoint_lists/checkpoints_allenai_OLMo-1B-0724-hf.txt


In [6]:
repo_name = "allenai/OLMo-7B-0724-hf"
branches = fetch_repo_branches(
    repo_name=repo_name, 
    include_main=INCLUDE_MAIN, 
    move_main_to_end=MOVE_MAIN_TO_END)
save_branches(
    branches=branches, 
    out_path=outdir / default_filename(repo_name))

Saved 820 checkpoints to checkpoint_lists/checkpoints_allenai_OLMo-7B-0724-hf.txt


In [12]:
from huggingface_hub import list_repo_refs

# Get all branches (checkpoints) for the OLMo model
repo_name = "allenai/OLMo-2-1124-7B"
out = list_repo_refs(repo_name)
branches = [b.name for b in out.branches]

stage1 = [branch for branch in branches if branch.startswith("stage1")]
stage1.sort(key=lambda x: int(x.split("-")[1].split("step")[1]))  # Sort by checkpoint number

stage2_ing1 = [branch for branch in branches if branch.startswith("stage2") and "ingredient1" in branch]
stage2_ing1.sort(key=lambda x: int(x.split("-")[2].split("step")[1]))  # Sort by checkpoint number

stage2_ing2 = [branch for branch in branches if branch.startswith("stage2") and "ingredient2" in branch]
stage2_ing2.sort(key=lambda x: int(x.split("-")[2].split("step")[1]))  # Sort by checkpoint number

stage2_ing3 = [branch for branch in branches if branch.startswith("stage2") and "ingredient3" in branch]
stage2_ing3.sort(key=lambda x: int(x.split("-")[2].split("step")[1]))  # Sort by checkpoint number

sorted_branches = stage1 + stage2_ing1 + stage2_ing2 + stage2_ing3 + ["main"]

assert set(sorted_branches) == set(branches)
print(len(sorted_branches))
print(len(branches))

out_path= "checkpoint_lists" + default_filename(repo_name)
with open(out_path, "w") as f:
    for branch in sorted_branches:
        f.write(branch + "\n")

print(f"Saved {len(branches)} checkpoints to {txt_name}")

965
965
Saved 965 checkpoints to checkpoint_lists/checkpoints_bloom_1b1.txt


In [9]:
from huggingface_hub import list_repo_refs

# Get all branches (checkpoints) for the OLMo model
repo_name = "allenai/OLMo-2-0425-1B"
out = list_repo_refs(repo_name)
branches = [b.name for b in out.branches]

stage1 = [branch for branch in branches if branch.startswith("stage1")]
stage1.sort(key=lambda x: int(x.split("-")[1].split("step")[1]))  # Sort by checkpoint number

stage2_ing1 = [branch for branch in branches if branch.startswith("stage2") and "ingredient1" in branch]
stage2_ing1.sort(key=lambda x: int(x.split("-")[2].split("step")[1]))  # Sort by checkpoint number

stage2_ing2 = [branch for branch in branches if branch.startswith("stage2") and "ingredient2" in branch]
stage2_ing2.sort(key=lambda x: int(x.split("-")[2].split("step")[1]))  # Sort by checkpoint number

stage2_ing3 = [branch for branch in branches if branch.startswith("stage2") and "ingredient3" in branch]
stage2_ing3.sort(key=lambda x: int(x.split("-")[2].split("step")[1]))  # Sort by checkpoint number

sorted_branches = stage1 + stage2_ing1 + stage2_ing2 + stage2_ing3 + ["main"]

assert set(sorted_branches) == set(branches)
print(len(sorted_branches))
print(len(branches))
# Save to a text file
txt_name = "checkpoint_lists/checkpoints_olmo2_1B_april.txt"
with open(txt_name, "w") as f:
    for branch in sorted_branches:
        f.write(branch + "\n")

print(f"Saved {len(branches)} checkpoints to {txt_name}")

268
268
Saved 268 checkpoints to checkpoint_lists/checkpoints_olmo2_1B_april.txt


### BLOOM

In [None]:
from huggingface_hub import list_repo_refs
from transformers import AutoModelForCausalLM, AutoTokenizer


# Get all branches (checkpoints) for the OLMo model
repo_name = "bigscience/bloom-1b1-intermediate"
out = list_repo_refs(repo_name)
print(out)
branches = [t.name for t in out.tags if t.name.startswith("global_step")]
print(branches)
# branches.remove("main")  # Remove the main branch
branches.sort(key=lambda x: int(x.split("-")[0].split("global_step")[1]))  # Sort by checkpoint number

branches.append("main")
print(len(branches))
# Save to a text file
txt_name = "checkpoint_lists/checkpoints_bloom_1b1.txt"

with open(txt_name, "w") as f:
    for branch in branches:
        f.write(branch + "\n")

print(f"Saved {len(branches)} checkpoints to {txt_name}")

GitRefs(branches=[GitRefInfo(name='main', ref='refs/heads/main', target_commit='5f57bed76d881bf87b3dbfcf6abb23957bea6048')], converts=[], tags=[GitRefInfo(name='global_step1000', ref='refs/tags/global_step1000', target_commit='db7749f2f5a4cb47e092251d29ed7341ff647bab'), GitRefInfo(name='global_step10000', ref='refs/tags/global_step10000', target_commit='28de8f9c9edc6080f964e23ef4d978d538a16905'), GitRefInfo(name='global_step100000', ref='refs/tags/global_step100000', target_commit='15e08069b71d6d90c4d6d0cf5c435655b7a58952'), GitRefInfo(name='global_step200000', ref='refs/tags/global_step200000', target_commit='693f62167abb4c55c09c288504ccf58d2539a3ae'), GitRefInfo(name='global_step300000', ref='refs/tags/global_step300000', target_commit='e061bb33d130b5f1ce4dbe95c195930a2dcd124a'), GitRefInfo(name='global_step400000', ref='refs/tags/global_step400000', target_commit='a758d93ecb5f0e4d7390137a56e42b4c32988fcb'), GitRefInfo(name='global_step500000', ref='refs/tags/global_step500000', targ