## SFT on GSM8k
This code provides a simplistic training recipe for running supervised fine-tuning on a Llama 3.2 3B model on GSM8k dataset.

In [24]:
### import relevant packages

from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments, AutoModelForCausalLM
from datasets import Dataset

from trl import SFTTrainer, SFTConfig, DataCollatorForCompletionOnlyLM


In [3]:
## mention a cache dir if you want to save a local file
cache_dir='/scratch/gpfs/ap34/hf_models'

model_name = "/scratch/gpfs/ARORA/ap34/Llama-3.2-3B" ### change here for other models
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)  ### load tokenizer
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [31]:
#load the relevant dataset and create a train/test split

data = load_dataset('openai/gsm8k', 'main', cache_dir=cache_dir)

train_data = data['train']
test_data = data['test']

def create_conversation(d):
    text = '### Question:\n' + str(d['question']) + '\n\n### Solution:\n' + str(d['answer'])
    return {'text': text}

conversation_train_data = []
for d in train_data:
    conversation_train_data += [create_conversation(d)]
conversation_train_data = Dataset.from_list(conversation_train_data)

Using the latest cached version of the dataset since openai/gsm8k couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'main' at /scratch/gpfs/ap34/hf_models/openai___gsm8k/main/0.0.0/e53f048856ff4f594e959d75785d2c2d37b678ee (last modified on Thu Apr 10 08:47:38 2025).


In [34]:
# define the training arguments
output_dir='result'

num_train_epochs=3
per_device_train_batch_size=1
gradient_accumulation_steps=64
gradient_checkpointing=False
optim = 'adamw_torch'

logging_steps=10
save_strategy='epoch'
bf16='True'

learning_rate=5e-5
weight_decay=0.0

max_sequence_length=512

warmup_ratio=0.03
lr_scheduler_type='cosine'


In [35]:
# Now, we define the arguments for the Trainer

# We first start with a data collator that computes loss only on the answer tokens
response_template = "### Solution:\n" 
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# Next we define the configuration of the trainer with relevant arguments
args = SFTConfig(
    output_dir=output_dir,  # directory to save and repository id
    num_train_epochs=num_train_epochs,  # number of training epochs
    per_device_train_batch_size=per_device_train_batch_size,  # batch size per device during training
    gradient_accumulation_steps=gradient_accumulation_steps,  # number of steps before performing a backward/update pass
    gradient_checkpointing=gradient_checkpointing,  # use gradient checkpointing to save memory
    optim=optim,  # use fused adamw optimizer
    logging_steps=logging_steps,  # log every 10 steps
    save_strategy=save_strategy,  # save checkpoint every epoch
    bf16=bf16,  # use bfloat16 precision
    learning_rate=learning_rate,  # learning rate, based on QLoRA paper
    weight_decay=weight_decay,
    max_seq_length=max_sequence_length,
    warmup_ratio=warmup_ratio,  # warmup ratio based on QLoRA paper
    lr_scheduler_type=lr_scheduler_type,  # use constant learning rate scheduler
    report_to='tensorboard',
)

config = {}
config['model'] = model
config['processing_class'] = tokenizer
config['train_dataset'] = conversation_train_data

config['args'] = args
config['data_collator'] = collator


trainer = SFTTrainer(**config)

train_result = trainer.train()
metrics = train_result.metrics

metrics["train_samples"] = len(conversation_train_data)

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()


trainer.save_model(output_dir)  # Saves model & weights

# Explicitly save the tokenizer
tokenizer.save_pretrained(output_dir)


Converting train dataset to ChatML:   0%|          | 0/7473 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/7473 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/7473 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/7473 [00:00<?, ? examples/s]

  super().__init__(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
10,0.5852
20,0.6016
30,0.593
40,0.5761
50,0.556
60,0.5569
70,0.5512
80,0.5411
90,0.5248
100,0.5485


***** train metrics *****
  total_flos               = 57671183GF
  train_loss               =     0.2833
  train_runtime            = 0:28:32.71
  train_samples            =       7473
  train_samples_per_second =      13.09
  train_steps_per_second   =      0.203


('result/tokenizer_config.json',
 'result/special_tokens_map.json',
 'result/tokenizer.json')

In [38]:
## Now evaluate by generating the responses
from transformers import GenerationConfig
import re
from sympy import sympify, Eq, simplify


generation_config = GenerationConfig(
    max_length=max_sequence_length,
    temperature=0.0,
    top_p=0.95,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

def create_question(d):
    text = '### Question:\n' + str(d['question']) + '\n\n### Solution:\n' 
    return {'text': text, 'answer': d['answer']}

conversation_test_data = []
for d in test_data:
    conversation_test_data += [create_question(d)]
conversation_test_data =  Dataset.from_list(conversation_test_data)


def extract_last_number(text):
    numbers = re.findall(r"-?\d+(?:\.\d+)?", text)
    return numbers[-1] if numbers else None

def numerically_equal(a, b, tol=1e-6):
    try:
        a_expr = sympify(a, evaluate=True)
        b_expr = sympify(b, evaluate=True)
        diff = abs(float(a_expr) - float(b_expr))
        return diff <= tol
    except:
        return False
    
def is_correct(prediction, reference):
    return numerically_equal(extract_last_number(prediction), extract_last_number(reference))

# Generate and evaluate
correct = 0.
total = 0.

for d in conversation_test_data:
    prompt = d['text']
    answer = d['answer']
    true_answer = answer.split('####')[-1].strip()

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, generation_config=generation_config)
    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)

    

    if is_correct(response, true_answer):
        correct += 1

    total += 1
    print ('Generated responses for:', total, 'Prediction Accuracy:', (1.*correct)/total)


print ('Prediction Accuracy:', (1.*correct)/total)






Generated responses for: 1.0 Prediction Accuracy: 0.0
Generated responses for: 2.0 Prediction Accuracy: 0.5
Generated responses for: 3.0 Prediction Accuracy: 0.3333333333333333
Generated responses for: 4.0 Prediction Accuracy: 0.5
Generated responses for: 5.0 Prediction Accuracy: 0.6
Generated responses for: 6.0 Prediction Accuracy: 0.5
Generated responses for: 7.0 Prediction Accuracy: 0.5714285714285714
Generated responses for: 8.0 Prediction Accuracy: 0.5
Generated responses for: 9.0 Prediction Accuracy: 0.4444444444444444
Generated responses for: 10.0 Prediction Accuracy: 0.4
Generated responses for: 11.0 Prediction Accuracy: 0.36363636363636365
Generated responses for: 12.0 Prediction Accuracy: 0.3333333333333333


KeyboardInterrupt: 

In [17]:
print (conversation_train_data['text'][0])

### Question:
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?

### Solution:
Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72
