### Installation

In [None]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    !pip install --no-deps unsloth vllm
    # ==0.7.3

In [None]:
#@title Colab Extra Install { display-mode: "form" }
%%capture
# Installs Unsloth and vLLM. In Colab it additionally installs an explicit list
# of companion packages and a filtered copy of vLLM's requirements file.
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    !pip install --no-deps unsloth vllm==0.7.3
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    # Skip restarting message in Colab
    # The module names are snapshotted into a list first so sys.modules can be
    # mutated while looping; every cached module whose name contains "PIL" or
    # "google" is then dropped from sys.modules.
    import sys, re, requests; modules = list(sys.modules.keys())
    for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft "trl==0.15.2" triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer

    # vLLM requirements - vLLM breaks Colab due to reinstalling numpy
    # NOTE(review): the list is downloaded from vLLM's `main` branch while vLLM
    # itself is pinned to 0.7.3 above - confirm the two stay compatible.
    f = requests.get("https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/requirements/common.txt").content
    with open("vllm_requirements.txt", "wb") as file:
        # Strip everything from an occurrence of "transformers", "numpy" or
        # "xformers" through the end of that line before writing the file, so
        # the pip call below does not install those entries from this list.
        file.write(re.sub(rb"(transformers|numpy|xformers)[^\n]{1,}\n", b"", f))
    !pip install -r vllm_requirements.txt

# **Model**

Load up `DeepSeek-R1-Distill-Qwen-1.5B` and set its loading parameters.

In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch

# Knobs reused by later cells.
lora_rank = 64          # Larger rank = smarter, but slower
max_seq_length = 16384  # Can increase for longer reasoning traces

# Everything passed to the loader, gathered in one place for easy tweaking.
load_kwargs = dict(
    model_name="unsloth/DeepSeek-R1-Distill-Qwen-1.5B",
    max_seq_length=max_seq_length,
    load_in_4bit=True,           # False for LoRA 16bit
    fast_inference=True,         # Enable vLLM fast inference
    max_lora_rank=lora_rank,
    gpu_memory_utilization=0.5,  # Reduce if out of memory
)

model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 05-09 15:17:44 __init__.py:207] Automatically detected platform cuda.
Unsloth: Switching from Unsloth dynamic quant to normal quant since
we do not yet support fast inference for unsloth/deepseek-r1-distill-qwen-1.5b-unsloth-bnb-4bit
==((====))==  Unsloth 2025.4.7: Fast Qwen2 patching. Transformers: 4.51.3. vLLM: 0.7.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Unsloth: vLLM loading unsloth/deepseek-r1-distill-qwen-1.5b-bnb-4bit with actual GPU utilization = 49.53%
Unsloth: Your GPU has CUDA compute capability 7.5 with VRAM = 14.74 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 16384. Num Sequences = 192.
Unsloth: vLLM's KV Cache can use up to 5.53 GB. Also swap space = 2 GB.


## Evaluation

In [None]:
from datasets import load_dataset, Dataset

In [None]:
# Fetch the `simplescaling/aime24_nofigures` dataset and keep its "train" split.
aime24_splits = load_dataset("simplescaling/aime24_nofigures")
aime24 = aime24_splits["train"]
aime24

In [None]:
from vllm import SamplingParams

# Temperature 0.0 (greedy sampling in vLLM); each completion may be up to
# `max_seq_length` tokens long.
sampling_params = SamplingParams(temperature=0.0, max_tokens=max_seq_length)

In [None]:
# Try generation on a single problem: build the chat prompt, show it, then
# show the text of the first completion returned for it.
example = aime24[7]
messages = [{"role": "user", "content": example["problem"]}]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)

generations = model.fast_generate([prompt], sampling_params=sampling_params)
output = generations[0].outputs[0].text
print(output)

# Run till this cell

In [None]:

# Generate one completion per dataset problem, with no LoRA adapter attached,
# and collect the raw completion texts in dataset order.
pretrained_outputs = []
for i, example in enumerate(aime24):
    print(f"processing {i}-th example")
    text = tokenizer.apply_chat_template(
        [{"role": "user", "content": example["problem"]}],
        tokenize=False,
        add_generation_prompt=True,
    )

    # One prompt per call; keep only the text of its first completion.
    output = model.fast_generate(
        [text],
        sampling_params=sampling_params,
        lora_request=None,
    )[0].outputs[0].text
    pretrained_outputs.append(output)

In [None]:
import re

def extract_boxed_value(text):
    r"""Return the contents of the first balanced ``\boxed{...}`` in ``text``.

    Braces nested inside the box are matched, so ``\boxed{\frac{1}{2}}``
    yields ``\frac{1}{2}`` instead of being cut off at the first ``}``.

    Args:
        text: String to search (e.g. a model completion). May be empty or None.

    Returns:
        The text between the braces of the first ``\boxed{`` whose braces are
        balanced, or None if ``text`` is empty/None or no such box exists.
    """
    if not text:
        return None

    for marker in re.finditer(r'\\boxed\{', text):
        content_start = marker.end()
        depth = 1  # the opening brace of \boxed{ itself
        for pos in range(content_start, len(text)):
            ch = text[pos]
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
                if depth == 0:
                    return text[content_start:pos]
        # This \boxed{ never closes; fall through and try the next occurrence.

    return None

# Quick check of the extractor on the most recent generation held in `output`.
# `boxed_value` was previously printed without ever being assigned, which
# raised NameError on a fresh kernel before any results were collected.
boxed_value = extract_boxed_value(output)
print(f"The answer is: {boxed_value}")

# Pair every dataset row with its completion and the answer parsed from it.
pretrained_results = [
    {**row, "prediction": completion, "pred_ans": extract_boxed_value(completion)}
    for row, completion in zip(aime24, pretrained_outputs)
]

# Print the reference answer next to the extracted prediction for each problem.
for t in pretrained_results:
    print(t['answer'], t['pred_ans'])

### Training Data Prep
<a name="Data"></a>

In [None]:
# SYSTEM_PROMPT = """
# Respond in the following format:
# <reasoning>
# ...
# </reasoning>
# <answer>
# ...
# </answer>
# """


In [None]:
# i = 0
# outputs = []
# for example in aime24:
#     print(f"processing {i}-th example")
#     text = tokenizer.apply_chat_template([
#         {'role': 'system', 'content': SYSTEM_PROMPT},
#         {"role" : "user", "content" : example['problem']},
#     ], tokenize = False, add_generation_prompt = True)

#     output = model.fast_generate(
#       [text],
#       sampling_params = sampling_params,
#       lora_request = None,
#       )[0].outputs[0].text
#     outputs.append(output)
#     i +=1