# Mini Deepseek R1
This notebook reimplements [Mini Deepseek R1](https://www.philschmid.de/mini-deepseek-r1): the data preparation and reward functions run locally, and the training runs remotely on Modal.


References:
- [Mini Deepseek R1](https://www.philschmid.de/mini-deepseek-r1)
- [How to Launch a Jupyter Notebook on Modal Programmatically](https://modal.com/blog/how_to_launch_a_jupyter_notebook_on_modal_programmatically_article)
- [Modal Examples: Basic Notebook](https://github.com/modal-labs/modal-examples/blob/main/11_notebooks/basic.ipynb)

IMPORTANT: The Python version of the local Jupyter kernel MUST match the Python version in the Modal image (3.11 in this notebook).

Create a new venv for 3.11:
```
python3.11 -m venv venv
source venv/bin/activate
```

The Modal image definition used below was copied from [this example in the Modal client source](https://github.com/modal-labs/modal-client/blob/d924202767ef313ba67d60451975432e09e7b760/modal/image.py#L962C17-L962C55).

In [1]:
# Install the packages this notebook needs locally, then clear the cell output
# so the pip log does not clutter the notebook.
%pip install transformers datasets huggingface_hub jinja2 modal
from IPython.display import clear_output
clear_output()

In [15]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Download the training split of the Countdown task dataset.
print("Loading dataset from the Hugging Face Hub...")
dataset_id = "Jiayi-Pan/Countdown-Tasks-3to4"
dataset = load_dataset(dataset_id, split="train")

# Select a random subset of 50k samples
# (the fixed seed makes the shuffle, and so the selected subset, repeatable).
dataset = dataset.shuffle(seed=42).select(range(50000))
print("Dataset loaded. Sample count:", len(dataset))

# Tokenizer of the model that is trained later; in this notebook it is used by
# generate_r1_prompt to render the chat template.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")

def generate_r1_prompt(example):
    """Render one Countdown example as an R1-style chat prompt.

    The conversation is a system turn, a user turn built from the example's
    ``nums`` and ``target``, and a partial assistant turn that already opens a
    ``<think>`` tag. The chat template is applied with
    ``continue_final_message=True`` and ``tokenize=False``, so the result is a
    string.

    Returns a dict with the rendered ``prompt`` plus the example's ``target``
    and ``nums`` passed through unchanged.
    """
    system_turn = {
        "role": "system",
        "content": "You are a helpful assistant. You first thinks about the reasoning process in your mind and then provides the user with the answer.",
    }
    user_turn = {
        "role": "user",
        "content": f"Using the numbers {example['nums']}, create an equation that equals {example['target']}. You can use basic arithmetic operations (+, -, *, /) and each number can only be used once. Show your work in <think> </think> tags. And return the final equation and answer in <answer> </answer> tags, for example <answer> (1 + 2) / 3 = 1 </answer>.",
    }
    # Pre-filled start of the assistant reply; the model continues from here.
    assistant_turn = {
        "role": "assistant",
        "content": "Let me solve this step by step.\n<think>",
    }
    rendered = tokenizer.apply_chat_template(
        [system_turn, user_turn, assistant_turn],
        tokenize=False,
        continue_final_message=True,
    )
    return {"prompt": rendered, "target": example["target"], "nums": example["nums"]}

# Render the R1 prompt for every example.
dataset = dataset.map(generate_r1_prompt)

# Split dataset into train and test sets.
# The split is seeded like the shuffle that selected the subset; without a
# seed the train/test membership would change on every run, so evaluation
# numbers from different runs would not be comparable.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

print("Data preprocessing complete. Train size:", len(train_dataset), "Test size:", len(test_dataset))

Loading dataset from the Hugging Face Hub...
Dataset loaded. Sample count: 50000
Data preprocessing complete. Train size: 45000 Test size: 5000


In [21]:
import json
import os

# The Modal image copies the local "data" directory into the container, so it
# has to exist before the files are written.
os.makedirs("data", exist_ok=True)

# Export both splits as JSON for the remote training function to load.
with open("data/train_dataset.json", "w", encoding="utf-8") as f:
    json.dump(train_dataset.to_dict(), f)
with open("data/test_dataset.json", "w", encoding="utf-8") as f:
    # Write the held-out split here (not the training split again), otherwise
    # evaluation would run on the training data.
    json.dump(test_dataset.to_dict(), f)

In [8]:
import re

def format_reward_func(completions, target, **kwargs):
    """
    Reward completions laid out as <think>...</think>, a newline, then
    <answer>...</answer>, with nothing after the closing answer tag.

    Args:
        completions: generated texts; each is expected to start right after
            the "<think>" tag that already ends the prompt.
        target: expected results; only used to pair up with ``completions``
            (the output has one entry per zipped pair).
        **kwargs: extra dataset columns, ignored.

    Returns:
        A list of floats: 1.0 where the layout matches, otherwise 0.0.
    """
    expected_layout = r"^<think>([^<]*(?:<(?!/?think>)[^<]*)*)<\/think>\n<answer>([\s\S]*?)<\/answer>$"

    def score(text):
        try:
            # Put back the <think> tag that prefaces the assistant's output.
            found = re.search(expected_layout, "<think>" + text, re.DOTALL)
        except Exception:
            # Anything that cannot be checked (e.g. a non-string) earns nothing.
            return 0.0
        return 0.0 if found is None else 1.0

    return [score(text) for text, _ in zip(completions, target)]

def equation_reward_func(completions, target, nums, **kwargs):
    """
    Evaluates correctness of the equation in the output.

    A completion earns 1.0 only if the text inside <answer>...</answer>
    (single line) uses exactly the given numbers, consists solely of digits,
    whitespace, parentheses, "." and the operators + - * /, and evaluates to
    the target within 1e-5. Everything else earns 0.0.

    Args:
        completions: generated texts.
        target: expected result for each completion (anything ``float()``
            accepts).
        nums: for each completion, the integers that must each be used
            exactly once.
        **kwargs: extra dataset columns, ignored.

    Returns:
        A list of floats (1.0 or 0.0), one per zipped
        (completion, target, nums) triple.
    """
    answer_pattern = r"<answer>(.*?)<\/answer>"
    allowed_pattern = r'^[\d+\-*/().\s]+$'
    rewards = []
    for completion, gt, numbers in zip(completions, target, nums):
        try:
            completion = "<think>" + completion
            match = re.search(answer_pattern, completion)
            if match is None:
                rewards.append(0.0)
                continue
            equation = match.group(1).strip()
            # Extract and compare used numbers.
            used_numbers = [int(n) for n in re.findall(r'\d+', equation)]
            if sorted(used_numbers) != sorted(numbers):
                rewards.append(0.0)
                continue
            if not re.match(allowed_pattern, equation):
                rewards.append(0.0)
                continue
            # The character whitelist alone still admits "**" and "//".
            # Neither is one of the allowed basic operations, and a power
            # tower such as 55**36**19**7 would stall or exhaust memory in
            # eval below, so reject both explicitly.
            if "**" in equation or "//" in equation:
                rewards.append(0.0)
                continue
            # SECURITY: eval runs on model-generated text. It is only reached
            # after the checks above limit the input to plain arithmetic, and
            # builtins are disabled; keep those checks in place if this code
            # is changed.
            result = eval(equation, {"__builtins__": None}, {})
            if abs(float(result) - float(gt)) < 1e-5:
                rewards.append(1.0)
            else:
                rewards.append(0.0)
        except Exception:
            # Deliberately broad: malformed equations, division by zero or
            # unparsable targets must score 0.0 rather than abort training.
            rewards.append(0.0)
    return rewards

In [11]:
# (Optional) Run quick tests on the reward functions here if desired.
# The reward functions prepend "<think>" themselves, so the sample completions
# below start after that tag and only close it.

# Reasoning text followed by a correct answer: expected to score 1.0 with both
# reward functions.
correct_sample_1 = """We need to find an equation using the numbers 19, 36, 55, and 7
exactly once, with basic arithmetic operations, that equals 65. One possible
combination is 55 + 36 - 19 + 7... </think>
<answer> 55 + 36 - 7 - 19 </answer>"""

# Minimal variant of the same correct answer.
correct_sample_2 = """ ... </think>
<answer> 55 + 36 - 7 - 19 </answer>"""

# No tags at all: expected to score 0.0 with both reward functions.
wrong_format = """User: Using the numbers [19, 36, 55, 7], create an equation that equals 65."""
 
wrong_format_2 = """To find the equation that equals 79 using the numbers 95, 78, 6, 88, I'll start by adding 88 and 95:                      
95 + 88 = 183                                                                                                              
Now, let's subtract 104 from 183 to get 79:
183 - 104 = 79
<think> 183 - 104 = 79 </think><think> 183 - 104 = 79 </think><answer> 183 - 104 = 79 </answer>"""
 
# Well-formed output whose equation does not use the given numbers (18 instead
# of 19): passes the format check but not the equation check.
wrong_result = """ ... </think>
<answer> 55 + 36 - 7 - 18 </answer>"""

# Both reward functions are exercised with the same five completions, all
# sharing one target and one set of numbers.
sample_completions = [correct_sample_1, correct_sample_2, wrong_format, wrong_format_2, wrong_result]
sample_targets = ["65"] * len(sample_completions)
sample_nums = [[19, 36, 55, 7]] * len(sample_completions)

test_rewards = format_reward_func(
    completions=sample_completions,
    target=sample_targets,
    nums=sample_nums,
)
assert test_rewards == [1.0, 1.0, 0.0, 0.0, 1.0], "Reward function is not working"

test_rewards = equation_reward_func(
    completions=sample_completions,
    target=sample_targets,
    nums=sample_nums,
)
assert test_rewards == [1.0, 1.0, 0.0, 0.0, 0.0], "Reward function is not working"


# Train with Modal
- [Modal](https://modal.com/): [Docs](https://modal.com/docs)  |  [Logs](https://modal.com/logs)  |  [Github](https://github.com/modal-labs/modal-client)  |  [Notebook Example](https://modal.com/docs/guide/notebooks)
- Follow [this guide](https://modal.com/docs/guide) to create and connect an account on Modal
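- In [Modal Secrets](https://modal.com/secrets), create a secret named `HF_SECRET` that contains the key `HF_TOKEN` set to your Hugging Face access token (the training function reads it from the `HF_TOKEN` environment variable)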

In [None]:
import modal

# Modal app that the remote training function below is registered on.
app = modal.App()

# Container image for the training function: an NVIDIA CUDA 12.2 devel base
# image with Python 3.11 added, build tooling, and the pinned training stack.
image = (  
    modal.Image.from_registry("nvidia/cuda:12.2.2-cudnn8-devel-ubuntu22.04", add_python="3.11")
    # System packages: git plus a C/C++ toolchain.
    .apt_install("git", "cmake", "clang", "build-essential", "libgomp1")
    .run_commands(
        "git config --global credential.helper store",
        "git config --global --add safe.directory '*'"
    )
    # .entrypoint([])
    .pip_install(
        "torch==2.5.1", 
        "tensorboard", 
        "setuptools<71.0.0",
        "huggingface_hub",
        "trl==0.14.0",
        "bitsandbytes",
        "ninja",
        "peft",
        "transformers==4.48.1",
        "accelerate==1.3.0",
        "hf-transfer==0.1.9",
        "deepspeed==0.15.4",
        "vllm==0.7.0",
        "datasets",
    )
    # Installed in a separate step, after torch is already in the image.
    # NOTE(review): the --no-build-isolation option is left commented out;
    # confirm that flash-attn installs correctly without it.
    .pip_install("flash-attn==2.5.8") #, extra_options="--no-build-isolation")
    # Presumably switches Hugging Face Hub downloads to the hf-transfer
    # package pinned above - TODO confirm.
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    # Local "data" directory (holding the exported dataset JSON files) is made
    # available at /mnt/data, where train_model reads it.
    .add_local_dir("data", remote_path="/mnt/data")
)

@app.function(image=image, 
              gpu="A100", 
              secrets=[modal.Secret.from_name("HF_SECRET")],
              timeout=20 * 60 * 60,  # 20 hours
              # Persistent storage for checkpoints and the final model. The
              # container's own filesystem is discarded when the function
              # returns, so anything written outside this mount is lost.
              volumes={
                  "/mnt/output": modal.Volume.from_name(
                      "qwen-r1-aha-moment", create_if_missing=True
                  )
              },
              )
def train_model():
    """Run GRPO training of Qwen2.5-3B-Instruct on a remote A100.

    Logs in to the Hugging Face Hub with the ``HF_TOKEN`` environment variable
    (provided by the ``HF_SECRET`` Modal secret), loads the train/test JSON
    files from ``/mnt/data``, trains with the notebook-level
    ``format_reward_func`` and ``equation_reward_func`` as rewards, and saves
    the model under ``/mnt/output`` (a Modal Volume, so the result outlives
    the container).

    Returns:
        The string ``"Training complete!"``.

    Raises:
        KeyError: if ``HF_TOKEN`` is not set in the environment.
    """
    # Import modules inside the function so they load from the Modal image.
    import os
    from huggingface_hub import login
    from datasets import Dataset
    login(token=os.environ["HF_TOKEN"], add_to_git_credential=True)
    print("Logged into HF Hub in remote training function.")
    
    import json
    # Import TRL and related modules inside the function.
    from trl import GRPOConfig, GRPOTrainer, get_peft_config, ModelConfig

    # Load the datasets from the mounted directory.
    with open("/mnt/data/train_dataset.json", encoding="utf-8") as f:
        train_dataset = Dataset.from_dict(json.load(f))
    with open("/mnt/data/test_dataset.json", encoding="utf-8") as f:
        test_dataset = Dataset.from_dict(json.load(f))
    print("Loaded datasets.")
    # Define the model's configuration.
    # NOTE(review): only model_name_or_path and the PEFT settings of this
    # config are passed on to the trainer below; confirm whether torch_dtype
    # and attn_implementation are meant to take effect.
    model_config = ModelConfig(
        model_name_or_path="Qwen/Qwen2.5-3B-Instruct",
        torch_dtype="bfloat16",
        attn_implementation="flash_attention_2",
        use_peft=True,
        load_in_4bit=True,
    )
    print("Build model config.")
    # Specify GRPO training hyperparameters.
    training_args = GRPOConfig(
        # Inside the Volume mount so checkpoints and the final model persist.
        output_dir="/mnt/output/qwen-r1-aha-moment",
        learning_rate=5e-7,
        lr_scheduler_type="cosine",
        logging_steps=10,
        max_steps=100,  # Example: Increase for real training.
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        bf16=True,
        max_prompt_length=256,
        max_completion_length=1024,
        num_generations=2,
        beta=0.001,
    )

    # Initialize the GRPO trainer.
    trainer = GRPOTrainer(
        model=model_config.model_name_or_path,
        reward_funcs=[format_reward_func, equation_reward_func],
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        peft_config=get_peft_config(model_config),
    )

    print("Starting training on remote GPU...")
    trainer.train()
    trainer.save_model(training_args.output_dir)
    print("Training complete; model saved to:", training_args.output_dir)
    return "Training complete!"

# Run the app, call the training function remotely and print the value it
# returns once it has finished.
with app.run():
    result = train_model.remote()
    print(result)
