In [None]:
import os
import pandas as pd
import torch
import yaml
from pathlib import Path
from tqdm.auto import tqdm

from chg.misc.prompt_tokenizer import PromptTokenizer
from chg.misc.format_data import create_dataset

In [3]:
# Read the project configuration and turn every configured directory into a Path.
config_path = Path("projects/causal_head_gating/config.yaml")
with config_path.open("r") as config_file:
    config = yaml.safe_load(config_file)
directories = dict(
    (key, Path(value)) for key, value in config['directories'].items()
)

# Point the Hugging Face cache at the configured directory. This happens before
# the Hugging Face imports below, presumably so that they pick up the location
# (NOTE(review): confirm the libraries read HF_HOME at import time).
os.environ['HF_HOME'] = str(directories['huggingface'])

from transformers import AutoTokenizer
from datasets import load_dataset

device = 0
# Source data: the 'train_1M' split of nvidia/OpenMathInstruct-2.
dataset = load_dataset("nvidia/OpenMathInstruct-2", split='train_1M')

Using the latest cached version of the dataset since nvidia/OpenMathInstruct-2 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /scratch/gpfs/JDC/andrew/huggingface/datasets/nvidia___open_math_instruct-2/default/0.0.0/469216e3f46f4dacf476b382e192485ea51a143e (last modified on Fri Apr 18 20:10:06 2025).


# Get problem lengths

In [4]:
# Sequence lengths only need to be approximate here, so one model's tokenizer
# is used as the yardstick for every problem.
model_name = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
prompt_tokenizer = PromptTokenizer(tokenizer)

In [7]:
def _length_record(example):
    """Build one length-table row for a single dataset example.

    Tokenizes the example's problem together with its generated solution using
    the notebook-level `prompt_tokenizer` and records how many tokens result.
    The loss mask returned by the tokenizer is not needed here.
    """
    token_ids, _loss_mask = prompt_tokenizer.tokenize(
        example['problem'], example['generated_solution']
    )
    return {
        'problem': example['problem'],
        'solution': example['generated_solution'],
        'answer': example['expected_answer'],
        'length': len(token_ids),
    }


# One record per dataset example, in dataset order.
rows = [_length_record(example) for example in tqdm(dataset)]

  0%|          | 0/1000000 [00:00<?, ?it/s]

In [None]:
# Persist the per-problem length table as parquet under <save>/datasets/math.
df_problems = pd.DataFrame(data=rows)
dirname = directories['save'].joinpath('datasets', 'math')
dirname.mkdir(parents=True, exist_ok=True)
lengths_path = dirname / 'OpenMathInstruct-2_train_1M.parquet'
df_problems.to_parquet(lengths_path)

# Tokenize

In [12]:
dirname = directories['save'].joinpath('datasets', 'math')
instructions = "For each problem, explain your reasoning step by step and use LaTeX for all mathematical expressions. Indicate your final answer using \\boxed{...}."
num_examples = 50
num_train = 50000
num_validation = 5000

# Total number of problems to keep: few-shot examples + train + validation.
num_keep = num_examples + num_train + num_validation

# Reload the saved length table, then keep only problems with a non-empty
# answer, drop repeated problem statements, and retain the shortest `num_keep`
# problems by token length.
df_problems = (
    pd.read_parquet(dirname / 'OpenMathInstruct-2_train_1M.parquet')
    .loc[lambda frame: frame['answer'].str.len() > 0]
    .drop_duplicates('problem')
    .sort_values('length')
    .head(num_keep)
    .reset_index(drop=True)
)

In [14]:
# Models whose tokenizers are used to build a tokenized prompt dataset each.
model_names = [
    # 'meta-llama/Llama-3.2-3B-Instruct',
    'meta-llama/Llama-3.2-3B',
    'meta-llama/Llama-3.2-1B',
    'meta-llama/Llama-3.1-8B',
]
parquet_name = 'prompts.parquet'
for model_name in tqdm(model_names):
    print(f"Tokenizing {model_name}")
    save_dir = directories['save'] / f'datasets/math/{model_name}'
    save_dir.mkdir(parents=True, exist_ok=True)
    prompts_path = save_dir / parquet_name
    # The prompts parquet doubles as the "already processed" marker for a model.
    if prompts_path.exists():
        continue

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    df_prompts, text_tokens, loss_masks = create_dataset(
        tokenizer,
        instructions=instructions,
        questions=df_problems.problem.tolist(),
        targets=df_problems.solution.tolist(),
        example_set_size=num_examples,
    )

    for split in ['train', 'validation']:
        save_path = save_dir / f'{split}.pt'
        split_mask = df_prompts['split'] == split
        # Named `split_data` rather than `dataset` so the Hugging Face dataset
        # loaded earlier in the notebook is not overwritten by this cell.
        split_data = {
            'text_tokens': text_tokens[split_mask],
            'loss_masks': loss_masks[split_mask],
        }
        torch.save(split_data, save_path)

    # Write the prompts table into the model's own directory (the location the
    # guard above checks), and write it last so that an interrupted run is not
    # mistaken for a finished one on the next execution.
    df_prompts.to_parquet(prompts_path)

  0%|          | 0/3 [00:00<?, ?it/s]

Tokenizing meta-llama/Llama-3.2-3B
Assigned splits: {'train': 49500, 'validation': 5500, 'example': 50}. Generating few-shot prompts.
Generated 55000 unique prompts. Tokenizing.
Tokenized 55000 prompts with min length 172 and max length 339.
Tokenizing meta-llama/Llama-3.2-1B
Assigned splits: {'train': 49500, 'validation': 5500, 'example': 50}. Generating few-shot prompts.
Generated 55000 unique prompts. Tokenizing.
Tokenized 55000 prompts with min length 172 and max length 339.
Tokenizing meta-llama/Llama-3.1-8B
Assigned splits: {'train': 49500, 'validation': 5500, 'example': 50}. Generating few-shot prompts.
Generated 55000 unique prompts. Tokenizing.
Tokenized 55000 prompts with min length 172 and max length 339.
