# Hierarchical Chain-of-Thought Training

Fine-tune Qwen3-0.6B on the OpenMathReasoning Hierarchical CoT dataset using `HCotTrainer`.

## Setup

Clone the repo (when running in Colab) or add the `lib/` directory to `sys.path` so that the `model` and `training` packages are importable.

In [None]:
import sys, os

# When running in Colab, clone the repo and add lib/ to the path
if "google.colab" in sys.modules:
    if not os.path.exists("cs224n-final-project"):
        !git clone https://github.com/anujjamwal/cs224n-final-project.git
    sys.path.insert(0, "cs224n-final-project/lib")
else:
    # Local: notebook lives inside lib/ already
    sys.path.insert(0, os.path.dirname(os.path.abspath("__file__")))

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset

from model import generate, masks
from model.model import THOUGHT_TOKEN, SOLUTION_TOKEN, RETURN_TOKEN, SPECIAL_TOKENS
from training.trainer import HCotTrainer

## Load Model and Tokenizer

In [5]:
MODEL_NAME = "Qwen/Qwen3-0.6B"

# Marker tokens used by the hierarchical chain-of-thought format.
# NOTE(review): these four names are also imported from `model.model` in the
# imports cell; the assignments below shadow those imports — confirm that the
# values agree, or drop one of the two definitions.
THOUGHT_TOKEN = "[THOUGHT]"
SOLUTION_TOKEN = "[SOLUTION]"
RETURN_TOKEN = "[RETURN]"
SPECIAL_TOKENS = [THOUGHT_TOKEN, SOLUTION_TOKEN, RETURN_TOKEN]

# Base causal LM, loaded in bfloat16 with automatic device placement.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, dtype=torch.bfloat16, device_map="auto"
)

# Register the marker tokens with the tokenizer as additional special tokens.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.add_special_tokens(dict(additional_special_tokens=SPECIAL_TOKENS))

# Resize the embedding matrix to the tokenizer's current vocabulary size.
base_model.resize_token_embeddings(len(tokenizer))
model = base_model

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/311 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

## Load and Tokenize Dataset

In [8]:
DATASET_NAME = "anujjamwal/OpenMathReasoning-Sampled-Hierarchical-Cot"
MAX_SEQ_LEN = 2048

# Load the train split, then keep only rows whose `hierarchical_cot` field
# has a length greater than 50.
raw_dataset = load_dataset(DATASET_NAME, split="train")
dataset = raw_dataset.filter(lambda row: len(row["hierarchical_cot"]) > 50)

print(f"Dataset size: {len(dataset)}")
print(f"Columns: {dataset.column_names}")
print(dataset[0].keys())

Filter:   0%|          | 0/13 [00:00<?, ? examples/s]

Dataset size: 12
Columns: ['id', 'question', 'expected_answer', 'problem_source', 'generated_solution', 'pass_rate_72b_tir', 'used_in_kaggle', 'hierarchical_cot', 'hierarchical_cot_raw']
dict_keys(['id', 'question', 'expected_answer', 'problem_source', 'generated_solution', 'pass_rate_72b_tir', 'used_in_kaggle', 'hierarchical_cot', 'hierarchical_cot_raw'])


In [9]:
def tokenize(example):
    """Build one training example from a question and its hierarchical CoT.

    The text is ``question + "\\n" + hierarchical_cot + eos_token``, tokenized
    with truncation and padding to ``MAX_SEQ_LEN``.

    Args:
        example: Mapping with ``"question"`` and ``"hierarchical_cot"`` entries.

    Returns:
        The tokenizer's encoding with an added ``"labels"`` entry. Labels equal
        ``input_ids`` on real tokens and ``-100`` on padding positions
        (``attention_mask == 0``), so padding is not supervised.
    """
    text = f"{example['question']}\n{example['hierarchical_cot']}{tokenizer.eos_token}"
    encoding = tokenizer(
        text,
        truncation=True,
        max_length=MAX_SEQ_LEN,
        padding="max_length",
    )
    # -100 is the default ignore_index of torch.nn.CrossEntropyLoss. Without
    # this mask every padded position up to MAX_SEQ_LEN would contribute to the
    # loss (assuming the trainer uses the standard label-ignoring loss).
    encoding["labels"] = [
        token_id if is_real else -100
        for token_id, is_real in zip(encoding["input_ids"], encoding["attention_mask"])
    ]
    return encoding

# Tokenize every example, dropping the original columns so that only the
# tokenizer outputs (plus labels) remain, and expose them as torch tensors.
tokenized_dataset = dataset.map(tokenize, remove_columns=dataset.column_names)
tokenized_dataset.set_format("torch")

first_example = tokenized_dataset[0]
print(first_example.keys())
print(f"Sample input length: {first_example['input_ids'].shape}")

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'attention_mask', 'labels'])
Sample input length: torch.Size([2048])


## Train

In [10]:
# Hyperparameters for the fine-tuning run.
# Effective batch size per device = 2 sequences x 4 accumulation steps = 8.
training_args = TrainingArguments(
    output_dir="./hcot-qwen3-0.6b",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    # `warmup_ratio` is deprecated (removal announced for v5.2); a float
    # `warmup_steps` below 1 is interpreted as a fraction of total steps.
    warmup_steps=0.1,
    bf16=True,
    logging_steps=1,
    save_strategy="epoch",
    report_to="none",
)

warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.


In [13]:
class HCotMaskBuilder(masks.MaterialisedMaskMixin):
    """Callable that builds the hierarchical attention mask for a batch.

    Stores the ids of the thought/solution/return marker tokens and delegates
    mask construction to ``_build_hierarchical_mask`` from the mixin.
    """

    def __init__(self, tokenizer):
        """Look up and store the ids of the three marker tokens."""
        to_id = tokenizer.convert_tokens_to_ids
        self.thought_token_id = to_id(THOUGHT_TOKEN)
        self.solution_token_id = to_id(SOLUTION_TOKEN)
        self.return_token_id = to_id(RETURN_TOKEN)

    def __call__(self, input_ids, padding_mask):
        """Return the mixin's hierarchical mask for ``input_ids`` and ``padding_mask``."""
        hierarchical_mask = self._build_hierarchical_mask(
            input_ids=input_ids,
            padding_mask=padding_mask,
        )
        return hierarchical_mask

In [14]:
# The trainer receives a callable that builds the attention mask for each batch.
mask_builder = HCotMaskBuilder(tokenizer)

trainer = HCotTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    processing_class=tokenizer,
    attention_mask_func=mask_builder,
)

trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Save the trained model and the tokenizer side by side in one directory.
final_dir = "./hcot-qwen3-0.6b/final"
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)

## Verify Generation

In [None]:
# Ids of the three marker tokens, passed through to the custom generate routine.
thought_token_id = tokenizer.convert_tokens_to_ids(THOUGHT_TOKEN)
solution_token_id = tokenizer.convert_tokens_to_ids(SOLUTION_TOKEN)
return_token_id = tokenizer.convert_tokens_to_ids(RETURN_TOKEN)

model.eval()

# Use the first dataset question as a single-item prompt batch on the model's device.
prompt = dataset[0]["question"]
inputs = tokenizer([prompt], return_tensors="pt").to(model.model.device)

# Generation is routed through the project's `generate.generate` via
# `custom_generate`, with KV caching turned off.
# NOTE(review): presumably caching is disabled because of the hierarchical
# masking — confirm against `generate.generate`.
gen_out = model.generate(
    **inputs,
    custom_generate=generate.generate,
    use_cache=False,
    eos_token_id=tokenizer.eos_token_id,
    thought_token_id=thought_token_id,
    solution_token_id=solution_token_id,
    return_token_id=return_token_id,
)

decoded = tokenizer.batch_decode(gen_out)
print(decoded)