# Environment Setup

### Option 1: Use Google Colab

In [None]:
!pip install --upgrade pip
!pip install "unsloth@git+https://github.com/unslothai/unsloth.git@September-2025-v3"

In [None]:
from google.colab import drive
import sys
# Mount Google Drive into the Colab filesystem.
drive.mount('/content/drive')

# Make modules stored under the Drive-hosted directory importable.
# Guarded so that re-running this cell does not keep appending the same
# entry to sys.path.
unsloth_env_path = '/content/drive/MyDrive/unsloth_env'
if unsloth_env_path not in sys.path:
    sys.path.append(unsloth_env_path)

### Option 2: Use [NYUAD HPC](https://ood.hpc.abudhabi.nyu.edu/pun/sys/dashboard/)

In [None]:
# Query the cluster's module system for available gcc / g++ entries.
!module avail gcc
!module avail g++
# NOTE(review): each `!` line presumably runs in its own subshell, so this
# `module load` likely does not persist into the kernel or into later cells.
# The next cell sets CC / CXX / PATH explicitly — confirm that is the part
# that actually takes effect.
!module load gcc/9.2.0
# Display the definition of the gcc/9.2.0 module.
!module show gcc/9.2.0

In [None]:
import os

# Directory holding the gcc 9.2.0 toolchain on the cluster.
gcc_bin = "/share/apps/NYUAD5/gcc/9.2.0/bin"

# Point build tools that honour CC / CXX at this toolchain.
os.environ["CC"] = os.path.join(gcc_bin, "gcc")
os.environ["CXX"] = os.path.join(gcc_bin, "g++")

# Put the toolchain first on PATH.
# - Any existing occurrence of gcc_bin is dropped first, so re-running this
#   cell does not grow PATH with duplicates.
# - An empty/missing PATH yields just gcc_bin (no trailing separator, which
#   shells interpret as "current directory").
existing_path = os.environ.get("PATH", "")
other_entries = (
    [entry for entry in existing_path.split(os.pathsep) if entry != gcc_bin]
    if existing_path
    else []
)
os.environ["PATH"] = os.pathsep.join([gcc_bin, *other_entries])

print("CC =", os.environ["CC"])
print("CXX =", os.environ["CXX"])

### Check the PyTorch / CUDA setup

In [None]:
import torch
# Report the PyTorch build and the visible GPUs.
# The device-specific queries are only issued when CUDA is available:
# torch.cuda.current_device() / get_device_name() raise on a machine without
# a usable CUDA device, which would turn this diagnostic cell into a crash.
print("PyTorch version:", torch.__version__)

cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)

if cuda_available:
    current_device = torch.cuda.current_device()
    print("Current device:", current_device)
    print("Device name:", torch.cuda.get_device_name(current_device))
else:
    print("Current device: none (no CUDA device is visible to PyTorch)")

gpu_count = torch.cuda.device_count()
print("Number of GPUs:", gpu_count)

# Model initialization

### Load an official model

In [None]:
from unsloth import FastLanguageModel
import torch

# Loader settings. These names are read again by later cells.
load_in_4bit = True    # 4-bit quantization to save memory
dtype = None           # None: auto-detect the best data type for the GPU
max_seq_length = 2048  # any sequence length may be chosen here

# Fetch the model and its tokenizer from Hugging Face.
# The base checkpoint is requested (rather than a pre-quantized 4-bit one)
# so that training starts from the official weights.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",  # Competition-approved model
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Data preparation

In [None]:
from datasets import load_dataset

# Knobs for this training round (R1), named so they are easy to find and tune.
SHUFFLE_SEED = 42
NUM_TRAIN_SAMPLES = 200_000

# Load the full training split.
full_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="train")

# Shuffle deterministically, then keep the first NUM_TRAIN_SAMPLES rows.
shuffled_dataset = full_dataset.shuffle(seed=SHUFFLE_SEED)

# Clamp to the split size: select() raises on indices past the end, so a
# smaller-than-expected split would otherwise abort the notebook here.
num_selected = min(NUM_TRAIN_SAMPLES, len(shuffled_dataset))
if num_selected < NUM_TRAIN_SAMPLES:
    print(
        f"Warning: requested {NUM_TRAIN_SAMPLES} samples but the split only "
        f"has {num_selected}; using all of them."
    )
train_dataset = shuffled_dataset.select(range(num_selected))  # R1: samples 0 ~ 200k

## Masked-token label generation (prompt tokens masked with -100)

In [None]:
# @title
# Prompt template: a fixed instruction that sets the model's role and
# behavior, followed by a single `{}` placeholder. formatting_prompts_func
# fills that placeholder with the question text; the tokens of this part are
# the ones masked out of the labels.
# NOTE(review): the instruction asks for lowercase 'true' / 'false', while the
# training target is str(is_correct); if `is_correct` is a Python bool that
# renders as 'True' / 'False' — confirm the casing mismatch is intended.
prompt_format = """You are a great mathematician and you are tasked with finding if a solution to a given maths question is correct or not. Your response should be 'true' if the solution is correct, otherwise 'false'. Below is the Question, Solution, and the Answer.
{}"""
# Completion template: three `{}` placeholders filled, in order, with the
# solution, the answer and the is_correct label (see formatting_prompts_func).
# Everything produced from this template is kept in the labels.
completion_format = """
Solution:
{}
Answer:
{}
Output:
{}"""

# We must add an End Of Sequence (EOS) token to tell the model when a completion is finished.
# It is appended to every training sequence in formatting_prompts_func.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Tokenize a batch of rows into ``input_ids`` / ``labels`` lists.

    For each row the prompt (built from the question) and the completion
    (built from solution, answer and is_correct) are concatenated, the EOS
    token is appended, and the result is tokenized without special tokens.
    Labels copy the token ids, except that the first ``len(prompt tokens)``
    positions are set to -100.

    Rows whose tokenized length exceeds ``max_seq_length`` are dropped, so
    the returned lists may be shorter than the incoming batch.

    Args:
        examples: batch mapping with the columns "question", "solution",
            "answer" and "is_correct", each a sequence of equal length.

    Returns:
        dict with "input_ids" and "labels", two parallel lists of token-id
        lists.
    """
    input_ids, labels = [], []
    columns = (
        examples["question"],
        examples["solution"],
        examples["answer"],
        examples["is_correct"],
    )

    for question, solution, answer, is_correct in zip(*columns):
        prompt_text = prompt_format.format(question)
        full_text = (
            prompt_text
            + completion_format.format(solution, str(answer), str(is_correct))
            + EOS_TOKEN
        )

        # The prompt is tokenized on its own only to learn how many leading
        # tokens of the full sequence have to be masked.
        prompt_ids = tokenizer.encode(prompt_text, add_special_tokens=False)
        sequence_ids = tokenizer.encode(full_text, add_special_tokens=False)

        # Keep only sequences that fit into the maximum length.
        if len(sequence_ids) <= max_seq_length:
            n_masked = len(prompt_ids)
            input_ids.append(sequence_ids)
            labels.append([-100] * n_masked + sequence_ids[n_masked:])

    return {"input_ids": input_ids, "labels": labels}

# Tokenize the training subset batch by batch. The original columns are
# removed, so the result only carries what formatting_prompts_func returns
# ("input_ids" and "labels").
formatted_train_dataset = train_dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=train_dataset.column_names,
)

In [None]:
# @title
# Sanity check of the first tokenized training example.
# Uses `formatted_train_dataset` and `tokenizer` from the cells above.

sample = formatted_train_dataset[0]

print("--- Viewing the first sample (index 0) properties ---")
print(f"Keys in the sample: {list(sample.keys())}")

# ---- 'input_ids': length and decoded text ----
if 'input_ids' not in sample:
    print("\nError: 'input_ids' not found in the sample")
    input_ids_length = 0  # sentinel used by the length comparison below
else:
    input_ids_length = len(sample['input_ids'])
    print("\nAttribute 'input_ids':")
    print(f"  - Total length: {input_ids_length}")

    print("\n  [Decoding Input IDs] (full sequence seen by the model):")
    try:
        # Special tokens are kept so the appended EOS token stays visible.
        full_input_text = tokenizer.decode(sample['input_ids'], skip_special_tokens=False)
    except NameError:
        print("  [Decoding failed] 'tokenizer' variable is not defined.")
    else:
        print(f"    '{full_input_text}'")

# ---- 'labels': length, masking statistics and decoded text ----
if 'labels' not in sample:
    print("\nError: 'labels' not found in the sample")
else:
    labels_total_length = len(sample['labels'])
    print("\nAttribute 'labels':")
    print(f"  - Total length: {labels_total_length}")

    # 'input_ids' and 'labels' must have the same length.
    if input_ids_length == 0:
        print("  (X Warning: 'input_ids' not found, cannot compare lengths)")
    elif input_ids_length == labels_total_length:
        print("  (✓ Length matches 'input_ids', which is correct)")
    else:
        print("  (X Warning: 'labels' and 'input_ids' lengths do not match, training may fail!)")

    # Positions whose label is not -100 are the ones the model must predict.
    effective_labels = [label_id for label_id in sample['labels'] if label_id != -100]
    effective_label_length = len(effective_labels)
    print(f"  - Effective label length (tokens model must predict): {effective_label_length}")

    # The remaining positions are the ones masked with -100.
    prompt_length = labels_total_length - effective_label_length
    print(f"  - Prompt length (tokens masked with -100): {prompt_length}")

    print("\n  [Decoding Labels] (portion the model is trained to predict):")
    try:
        # Only the unmasked label ids are decoded.
        effective_label_text = tokenizer.decode(effective_labels, skip_special_tokens=False)
    except NameError:
        print("  [Decoding failed] 'tokenizer' variable is not defined.")
    else:
        print(f"    '{effective_label_text}'")

print("\n--- Check complete ---")


# Training settings

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
# Model-saving path: checkpoints are written here by the trainer.
output_path = "/scratch/yl11109/trained_models_masked_R1"

# LoRA adapters on the attention and MLP projections.
# A large rank is chosen because math problems call for a high-capacity
# adapter; alpha follows the common alpha = 2 * r practice.
lora_rank = 128
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=2 * lora_rank,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,  # 8 * 8 = 64 samples per optimizer step
    warmup_steps=5,
    max_steps=3125,  # 8 * 8 * 3125 = 200k samples
    learning_rate=2e-5,  # 2e-5 is a decent value; 1e-5 was too small
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=42,
    report_to="none",
    output_dir=output_path,
    save_strategy="steps",  # write a checkpoint every `save_steps` steps
    save_steps=100,
    ddp_find_unused_parameters=False,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    completion_only_loss=True,
    train_dataset=formatted_train_dataset,
    dataset_num_proc=2,
    # dataset_text_field = "text",  (unused: the dataset holds input_ids / labels)
    max_seq_length=max_seq_length,
    args=training_args,
)

# Training

In [None]:
# Prepare the model for training via Unsloth's helper
# (presumably switches it out of inference mode — confirm in the Unsloth docs).
FastLanguageModel.for_training(model)
# Start fine-tuning with the trainer configured in the previous cell.
trainer.train()