In [1]:
from unsloth import FastLanguageModel, is_bfloat16_supported
max_seq_length = 1024 # Can increase for longer reasoning traces
lora_rank = 64 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen2.5-1.5B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = False,
    fast_inference=True,
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-21 17:20:13 [__init__.py:239] Automatically detected platform cuda.


2025-04-21 17:20:13,601	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.50.0.dev0. vLLM: 0.8.2.
   \\   /|    NVIDIA GeForce RTX 4070 Ti. Num GPUs = 1. Max memory: 11.994 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/Qwen2.5-1.5B-Instruct with actual GPU utilization = 44.81%
Unsloth: Your GPU has CUDA compute capability 8.9 with VRAM = 11.99 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 1024. Num Sequences = 160.
Unsloth: vLLM's KV Cache can use up to 2.36 GB. Also swap space = 0 GB.
INFO 04-21 17:20:21 [config.py:585] This model supports multiple tasks: {'embed', 'generate', 'reward', 'classify', 'score'}. Defaulting to 'generate'.
INFO 04-21 17:20:21 [arg_utils.p

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:03<00:00,  3.94s/it]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:03<00:00,  3.94s/it]


INFO 04-21 17:20:27 [loader.py:447] Loading weights took 4.03 seconds
INFO 04-21 17:20:27 [punica_selector.py:18] Using PunicaWrapperGPU.





INFO 04-21 17:20:28 [model_runner.py:1146] Model loading took 3.0237 GB and 5.203210 seconds
INFO 04-21 17:20:30 [worker.py:267] Memory profiling takes 1.87 seconds
INFO 04-21 17:20:30 [worker.py:267] the current vLLM instance can use total_gpu_memory (11.99GiB) x gpu_memory_utilization (0.45) = 5.37GiB
INFO 04-21 17:20:30 [worker.py:267] model weights take 3.02GiB; non_torch_memory takes 0.04GiB; PyTorch activation peak memory takes 0.87GiB; the rest of the memory reserved for KV Cache is 1.43GiB.
INFO 04-21 17:20:30 [executor_base.py:111] # cuda blocks: 3351, # CPU blocks: 0
INFO 04-21 17:20:30 [executor_base.py:116] Maximum concurrency for 1024 tokens per request: 52.36x
INFO 04-21 17:20:30 [model_runner.py:1442] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing 

Capturing CUDA graph shapes: 100%|██████████| 23/23 [00:14<00:00,  1.58it/s]

INFO 04-21 17:20:45 [model_runner.py:1570] Graph capturing finished in 14 secs, took 0.21 GiB
INFO 04-21 17:20:45 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 16.93 seconds



Unsloth 2025.3.19 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [2]:
# Cell 2: Data Prep (Modified for SFT)
import json
import os
from datasets import Dataset
from typing import Tuple, Optional, Union, Dict

# --- Helper Function (Keep if needed for formatting answer in template) ---
def format_solution_string(solution_dict: Dict[str, Union[int, float]]) -> str:
    """Formats a solution dictionary into a standardized string."""
    items = sorted(solution_dict.items())
    formatted_items = []
    for k, v in items:
        if isinstance(v, float) and v.is_integer():
            v = int(v)
        formatted_items.append(f"{k}={v}")
    return ", ".join(formatted_items)

# --- Modified Data Loading Function for SFT ---
def load_and_format_data_for_sft(json_path="./data.json", tokenizer=None) -> Dataset:
    """Loads data and formats it into a single 'text' field for SFT,
       using 'successful_llm_raw_response' as the target answer."""

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)


    processed_data = []
    skipped_count = 0
    for entry in data:


        prompt_content = entry['prompt']
        # Use the raw LLM response directly as the target answer
        answer_content = entry['response']

        # Create the message format expected by the tokenizer's chat template
        messages = [
            {'role': 'user', 'content': prompt_content},
            {'role': 'assistant', 'content': answer_content} # Use the raw response here
        ]

        # Apply the chat template to create a single text string
        formatted_text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False # We add the full conversation
        )
        processed_data.append({'text': formatted_text})
        

    # Create Hugging Face Dataset
    dataset = Dataset.from_dict({'text': [item['text'] for item in processed_data]})
    print(f"Loaded and formatted {len(dataset)} samples from {json_path} using 'successful_llm_raw_response' as target.")

    # Print a sample
    if len(dataset) > 0:
        print("\nSample formatted data point ('text' field):")
        print(dataset[0]['text'])

    return dataset

# --- Load the Dataset ---
dataset_path = "./data.json"
dataset = load_and_format_data_for_sft(dataset_path, tokenizer)

print(f"\nFinal dataset size after filtering: {len(dataset)}")
if len(dataset) == 0:
    raise ValueError("Dataset is empty after loading and filtering. Cannot train.")

# --- Split Dataset ---
# Consider a larger test set if possible, e.g., 100 or more
split_dataset = dataset.train_test_split(test_size=50, seed=42)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test'] # SFTTrainer can use this for perplexity eval

print(f"\nDataset split into:")
print(f"  Training set size: {len(train_dataset)}")
print(f"  Evaluation set size: {len(eval_dataset)}")


Loaded and formatted 1074 samples from ./data.json using 'successful_llm_raw_response' as target.

Sample formatted data point ('text' field):
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Solve the following system of linear equations for the variables x and y:
Equation 1: -21x -5y = 121
Equation 2: -22x + 28y = -538

Your response MUST be a single, valid JSON object.
Do NOT include any text or explanation before or after the JSON object.
Do NOT use markdown formatting like ```json ... ```.
The JSON object MUST conform exactly to the following structure:

{
  "reasoning": "A string containing the step-by-step derivation of the solution.",
  "solution": {
    "x": <numerical_value_for_x>,
    "y": <numerical_value_for_y>
  }
}

Replace <numerical_value_for_x> and <numerical_value_for_y> with the calculated numerical solutions for x and y, respectively. Ensure the "reasoning" field contains your step-by-step working as

<a name="Train"></a>
### Train the model

Now set up GRPO Trainer and all configurations!

In [None]:
from transformers import TrainingArguments

if 'bf16_supported' not in globals():
    bf16_supported = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size = 16, 
    gradient_accumulation_steps = 1, 
    warmup_ratio = 0.1, 
    num_train_epochs = 3, 
    learning_rate = 4e-4, 
    logging_steps = 5,    
    optim = "adamw_8bit", 
    save_strategy = "steps", 
    save_steps = 50,       
    eval_strategy = "steps", 
    eval_steps = 25,       
    bf16 = bf16_supported,
    fp16 = not bf16_supported,
    output_dir = "outputs_sft", 
    report_to = "none",
    seed = 3407,
)

print("\nTrainingArguments set for SFT.")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")



TrainingArguments set for SFT.
Effective batch size: 8


In [4]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset, 
    args = training_args,
    dataset_text_field = "text",        
    max_seq_length = max_seq_length,    
    dataset_num_proc = 2,               
    packing = False,
)


print("\nStarting SFT training...")
trainer.train()
print("\nSFT training finished.")

final_adapter_path = "sft_final_adapter"
trainer.save_model(final_adapter_path)
print(f"\nFinal SFT LoRA adapter saved to {final_adapter_path}")


Unsloth: Tokenizing ["text"] (num_proc=2): 100%|██████████| 1024/1024 [00:01<00:00, 990.68 examples/s]
Unsloth: Tokenizing ["text"] (num_proc=2): 100%|██████████| 50/50 [00:00<00:00, 90.28 examples/s]



Starting SFT training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,024 | Num Epochs = 3 | Total steps = 384
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 73,859,072/1,617,573,376 (4.57% trained)


Step,Training Loss,Validation Loss


Unsloth: Will smartly offload gradients to save VRAM!


KeyboardInterrupt: 

In [None]:
import json
import re
import time
from tqdm import tqdm
from vllm import SamplingParams
from typing import Dict, Union, Optional, List

print("\n" + "="*30)
print("  Starting SFT Model Evaluation")
print("="*30 + "\n")

# --- Helper Functions ---
def format_solution_string(solution_dict: Optional[Dict[str, Union[int, float]]]) -> str:
    """Formats a solution dictionary into a standardized string 'k1=v1, k2=v2'."""
    if not solution_dict or not isinstance(solution_dict, dict):
        return "N/A (Invalid Solution Format)"
    try:
        items = sorted(solution_dict.items())
        formatted_items = []
        for k, v in items:
            # Attempt to convert to int if it's a whole number float
            try:
                if isinstance(v, (int, float)) and float(v).is_integer():
                    v = int(v)
                elif isinstance(v, (int, float)):
                    # Keep float precision if not integer
                    v = float(v)
                else:
                    # Handle non-numeric values if necessary, or raise error
                    v = str(v) # Basic fallback
            except (ValueError, TypeError):
                 v = str(v) # Fallback for unconvertible types
            formatted_items.append(f"{k}={v}")
        return ", ".join(formatted_items)
    except Exception as e:
        # print(f"Error formatting solution dict {solution_dict}: {e}") # Debugging
        return "N/A (Format Error)"

def normalize_and_parse_json(text: Optional[str]) -> Optional[Dict]:
    """Removes markdown fences, normalizes whitespace, and parses JSON."""
    if not text or not isinstance(text, str):
        return None
    # Remove markdown fences (more robustly)
    text = re.sub(r'^```(?:json)?\s*', '', text.strip(), flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'\s*```$', '', text.strip(), flags=re.IGNORECASE | re.DOTALL)
    text = text.strip()
    try:
        # Handle potential escape sequences needed for JSON parsing
        # (This might need adjustment based on model output specifics)
        # text = text.encode('utf-8').decode('unicode_escape') # Example if needed
        return json.loads(text)
    except json.JSONDecodeError:
        # print(f"JSON Decode Error for text: {text[:100]}...") # Debugging
        return None # Indicate parsing failure

# --- 1. Ensure Model is in Eval Mode ---
model.eval()
print("Model set to evaluation mode.")

# --- 2. Prepare Evaluation Data ---
print(f"Preparing {len(eval_dataset)} evaluation prompts...")
dataset_path = "./data.json" # Make sure this path is correct
try:
    with open(dataset_path, 'r', encoding='utf-8') as f:
        original_data = json.load(f)
except Exception as e:
    print(f"Error reloading original data from {dataset_path}: {e}")
    raise

# Get indices from the eval split (handle potential missing column)
test_indices = split_dataset['test']['__index_level_0__'] if '__index_level_0__' in split_dataset['test'].column_names else list(range(len(eval_dataset)))

# Extract prompts and ground truths using indices
eval_prompts_content: List[str] = []
eval_answers_gt_raw: List[str] = []
eval_answers_gt_solution_str: List[str] = []
valid_original_data_map = {i: entry for i, entry in enumerate(original_data)} # Map original index to entry

skipped_indices = []
for idx in test_indices:
    entry = valid_original_data_map.get(idx)
    if (entry and 'prompt_sent' in entry and
        'successful_llm_raw_response' in entry and
        'actual_solution' in entry and
        isinstance(entry.get('actual_solution'), dict)):
        try:
            # Pre-format the ground truth solution string for comparison
            gt_solution_str = format_solution_string(entry['actual_solution'])
            if "N/A" not in gt_solution_str: # Basic check for valid GT format
                eval_prompts_content.append(entry['prompt_sent'])
                eval_answers_gt_raw.append(entry['successful_llm_raw_response'])
                eval_answers_gt_solution_str.append(gt_solution_str)
            else:
                skipped_indices.append(idx)
        except Exception as e:
            # print(f"Skipping index {idx} due to formatting error: {e}") # Debugging
            skipped_indices.append(idx)
    else:
        skipped_indices.append(idx)

if skipped_indices:
    print(f"Warning: Skipped {len(skipped_indices)} eval samples due to missing data or formatting errors in ground truth.")
if not eval_prompts_content:
     raise ValueError("No valid evaluation prompts could be prepared.")

print(f"Extracted {len(eval_prompts_content)} valid prompts and ground truth answers for evaluation.")

# --- 3. Format Prompts for Generation ---
eval_prompts_formatted = []
for p_content in eval_prompts_content:
    messages = [{'role': 'user', 'content': p_content}]
    # Apply template for generation (prompt model to respond)
    formatted_p = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    eval_prompts_formatted.append(formatted_p)
print("Evaluation prompts formatted for generation.")

# --- 4. Define Generation Parameters ---
# Adjust these parameters as needed for desired output behavior
eval_sampling_params = SamplingParams(
    temperature=1.0, 
    top_p=0.95,      
    top_k=64,  
    max_tokens=1024,  
)

# --- 5. Generate Model Outputs ---
print(f"\nGenerating responses for {len(eval_prompts_formatted)} evaluation samples...")
start_time_gen = time.time()
generated_outputs_text = []

# Assumes 'model' is already set up for fast_generate (e.g., via Unsloth/vLLM)
# If using HF generate, the loop structure is similar but uses model.generate()
outputs = model.fast_generate(
    eval_prompts_formatted,
    sampling_params=eval_sampling_params,
    use_tqdm=True
)

# Extract text from vLLM outputs
for output in outputs:
    # output format is typically RequestOutput(request_id, prompt, prompt_token_ids, outputs, finished, metrics, lora_request)
    # outputs is a list of CompletionOutput(index, text, token_ids, cumulative_logprob, logprobs, finish_reason, stop_reason)
    generated_text = output.outputs[0].text if output.outputs else ""
    generated_outputs_text.append(generated_text.strip())

end_time_gen = time.time()
print(f"Generation finished in {end_time_gen - start_time_gen:.2f} seconds.")

# --- 6. Compare Generated Solutions to Ground Truth ---
correct_count = 0
incorrect_samples = []
parse_errors = 0
format_errors = 0

print("\nCalculating accuracy (comparing parsed solutions)...")
start_time_eval = time.time()

for i, generated_text in enumerate(tqdm(generated_outputs_text, desc="Evaluating")):
    ground_truth_solution_str = eval_answers_gt_solution_str[i]

    # Parse the generated JSON
    generated_data = normalize_and_parse_json(generated_text)

    is_correct = False
    normalized_gen_solution = "N/A (Parse Error)" # Default if parsing fails

    if generated_data:
        # Extract and format the 'solution' part from the generated data
        generated_solution_dict = generated_data.get('solution')
        if isinstance(generated_solution_dict, dict):
            normalized_gen_solution = format_solution_string(generated_solution_dict)
            if "N/A (Format Error)" in normalized_gen_solution:
                 format_errors += 1
            # Compare the formatted solution strings
            is_correct = (normalized_gen_solution == ground_truth_solution_str)
        else:
            # Parsed JSON but 'solution' key is missing or not a dict
            normalized_gen_solution = "N/A (Missing/Invalid Solution Key)"
            format_errors += 1 # Count as a format error
    else:
        parse_errors += 1

    # Store incorrect samples for review
    if not is_correct:
        incorrect_samples.append({
            "index": i, # Index within the eval batch
            "original_data_index": test_indices[i], # Index in the original loaded data
            "prompt": eval_prompts_content[i][:200] + "...",
            "ground_truth_solution": ground_truth_solution_str,
            "generated_full_text": generated_text,
            "generated_parsed_solution": normalized_gen_solution,
        })
    else:
        correct_count += 1

end_time_eval = time.time()
print(f"Evaluation calculation finished in {end_time_eval - start_time_eval:.2f} seconds.")

# --- 7. Output Results ---
total_samples = len(eval_prompts_content)
accuracy = correct_count / total_samples if total_samples > 0 else 0

print("\n" + "="*30)
print("    SFT Evaluation Results (Solution Match)")
print("="*30)
print(f"Total valid evaluation samples:   {total_samples}")
print(f"Correct predictions (Solution):   {correct_count}")
print(f"Incorrect predictions:            {total_samples - correct_count}")
print(f"  - Generation Parse Errors:      {parse_errors}")
print(f"  - Solution Format/Key Errors:   {format_errors}")
print(f"Accuracy (Solution Match):        {accuracy:.4f} ({accuracy*100:.2f}%)")
print("="*30)

# Print some incorrect sample details
if incorrect_samples:
    print("\n--- Example Incorrect Samples ---")
    num_to_show = min(5, len(incorrect_samples))
    for k in range(num_to_show):
        sample = incorrect_samples[k]
        print(f"\nSample Eval Index: {sample['index']} (Original Index: {sample['original_data_index']})")
        print(f"  Ground Truth Solution: '{sample['ground_truth_solution']}'")
        print(f"  Generated Solution:    '{sample['generated_parsed_solution']}'")
        print(f"  Generated Full Text:\n'''\n{sample['generated_full_text']}\n'''")
    print("-" * 35)



  Starting SFT Model Evaluation

Model set to evaluation mode.
Preparing 50 evaluation prompts...
Extracted 50 valid prompts and ground truth answers for evaluation.
Evaluation prompts formatted for generation.

Generating responses for 50 evaluation samples...


Processed prompts: 100%|██████████| 50/50 [00:07<00:00,  6.92it/s, est. speed input: 1543.71 toks/s, output: 1806.78 toks/s]


Generation finished in 7.25 seconds.

Calculating accuracy (comparing parsed solutions)...


Evaluating: 100%|██████████| 50/50 [00:00<00:00, 48601.44it/s]

Evaluation calculation finished in 0.00 seconds.

    SFT Evaluation Results (Solution Match)
Total valid evaluation samples:   50
Correct predictions (Solution):   17
Incorrect predictions:            33
  - Generation Parse Errors:      2
  - Solution Format/Key Errors:   0
Accuracy (Solution Match):        0.3400 (34.00%)

--- Example Incorrect Samples ---

Sample Eval Index: 0 (Original Index: 0)
  Ground Truth Solution: 'x=-22, y=-21'
  Generated Solution:    'x=-17.8, y=33.8'
  Generated Full Text:
'''
{
  "reasoning": "To solve the system of linear equations, we can use the method of substitution or elimination. Here, we will use the elimination method. First, we will multiply Equation 2 by 20 to align the coefficients of y in both equations. This gives us: -340x + 20y = 5740. Now, we subtract Equation 1 from this new equation: (-340x + 20y) - (-26x + 20y) = 5740 - 152. Simplifying this, we get: -314x = 5588. Solving for x, we find x = -5588 / 314 = -17.8. Now, we substitute x =


