In [None]:
!pip install datasets
!pip install trl

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [None]:
"""
Reference:

https://gist.github.com/willccbb/4676755236bb08cab5f4e54a0475d6fb
"""
import re
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import GRPOConfig, GRPOTrainer

# Load and prep dataset

# System message placed first in every training prompt (see get_gsm8k_questions).
# It asks the model to wrap its reasoning and its final answer in
# <reasoning>...</reasoning> and <answer>...</answer> tags, one tag per line.
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

# Template with the same tag layout and `{reasoning}` / `{answer}` placeholders
# (presumably meant for str.format). The backslash after the opening quotes
# keeps the string from starting with a newline.
# NOTE(review): not referenced by any cell visible in this notebook.
XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

In [None]:
def extract_xml_answer(text: str) -> str:
    """Return the stripped text found inside the last ``<answer>`` tag.

    Everything up to and including the last ``<answer>`` is dropped, then
    everything from the following ``</answer>`` onward is dropped. When a tag
    is missing the corresponding cut simply does not happen, so a string with
    no tags at all is returned stripped but otherwise unchanged.
    """
    # rpartition yields the whole string as the tail when the tag is absent.
    after_open = text.rpartition("<answer>")[2]
    # partition yields the whole string as the head when the tag is absent.
    inner = after_open.partition("</answer>")[0]
    return inner.strip()

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

# uncomment middle messages for 1-shot prompting
def get_gsm8k_questions(split = "train") -> Dataset:
    """Load one split of ``openai/gsm8k`` ('main' config) in prompt/answer form.

    Each example gains a ``prompt`` field (a two-message chat: the system
    prompt followed by the question as the user turn) and has its ``answer``
    field replaced by the text after the ``####`` marker, as returned by
    ``extract_hash_answer``.

    Args:
        split: Name of the dataset split to load.

    Returns:
        The mapped dataset for the requested split.
    """
    def to_example(row):
        messages = [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': row['question']},
        ]
        return {
            'prompt': messages,
            'answer': extract_hash_answer(row['answer']),
        }

    all_splits = load_dataset('openai/gsm8k', 'main')
    selected = all_splits[split] # type: ignore
    return selected.map(to_example) # type: ignore

dataset = get_gsm8k_questions()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

In [None]:
# Reward functions
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Reward 2.0 for each completion whose extracted answer equals the reference.

    Also prints the first question, reference answer, raw response and
    extracted answer of the batch as a progress trace.

    Args:
        prompts: Chat-message lists; the last message of the first prompt is
            printed as the question.
        completions: One message list per completion; only the first message's
            ``content`` is scored.
        answer: Reference answers, aligned with ``completions``.
        **kwargs: Ignored.

    Returns:
        One float per completion: 2.0 on an exact string match, else 0.0.
    """
    responses = []
    for completion in completions:
        responses.append(completion[0]['content'])
    q = prompts[0][-1]['content']
    extracted_responses = list(map(extract_xml_answer, responses))
    print('-'*20, f"Question:\n{q}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")

    rewards = []
    for guess, gold in zip(extracted_responses, answer):
        rewards.append(2.0 if guess == gold else 0.0)
    return rewards

def int_reward_func(completions, **kwargs) -> list[float]:
    """Reward 0.5 for each completion whose extracted answer is all digits.

    The answer is taken from the first message's ``content`` via
    ``extract_xml_answer`` and tested with ``str.isdigit``.
    """
    responses = [completion[0]['content'] for completion in completions]
    rewards = []
    for response in responses:
        candidate = extract_xml_answer(response)
        rewards.append(0.5 if candidate.isdigit() else 0.0)
    return rewards

def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions that follow the exact line-by-line tag layout.

    A completion scores 0.5 when its whole content is::

        <reasoning>\\n ... \\n</reasoning>\\n<answer>\\n ... \\n</answer>\\n

    and 0.0 otherwise.

    The pattern is matched with ``re.DOTALL`` so that the reasoning and answer
    bodies may span several lines. Without that flag ``.`` stops at the first
    newline and any multi-line reasoning could never earn this reward.

    Args:
        completions: One message list per completion; only the first message's
            ``content`` is checked.
        **kwargs: Ignored.

    Returns:
        One float per completion: 0.5 on a match, else 0.0.
    """
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r, flags=re.DOTALL) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions that start with a reasoning block followed by an answer block.

    A completion scores 0.5 when its content begins with
    ``<reasoning>...</reasoning>``, optional whitespace, then
    ``<answer>...</answer>``; anything may follow. Otherwise it scores 0.0.

    The pattern is matched with ``re.DOTALL`` so the tag bodies may contain
    newlines. The format requested by the system prompt puts a newline right
    after each opening tag, so without the flag a compliant completion would
    never match.

    Args:
        completions: One message list per completion; only the first message's
            ``content`` is checked.
        **kwargs: Ignored.

    Returns:
        One float per completion: 0.5 on a match, else 0.0.
    """
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r, flags=re.DOTALL) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text) -> float:
    """Score how closely ``text`` follows the expected tag layout.

    Each of the four tag fragments contributes 0.125 when it occurs exactly
    once. The two answer-tag checks additionally subtract 0.001 per character
    found after the closing answer tag:

    * ``"\\n<answer>\\n"`` present once: subtract the length of whatever follows
      the last ``"\\n</answer>\\n"`` (the whole text if that fragment is absent).
    * ``"\\n</answer>"`` present once: subtract the length of whatever follows it,
      minus one.
    """
    score = 0.0

    def appears_once(fragment):
        return text.count(fragment) == 1

    if appears_once("<reasoning>\n"):
        score += 0.125
    if appears_once("\n</reasoning>\n"):
        score += 0.125
    if appears_once("\n<answer>\n"):
        after_close = text.split("\n</answer>\n")[-1]
        score += 0.125
        score -= len(after_close)*0.001
    if appears_once("\n</answer>"):
        after_close = text.split("\n</answer>")[-1]
        score += 0.125
        score -= (len(after_close) - 1)*0.001
    return score

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    """Apply ``count_xml`` to the first message's content of every completion."""
    scores = []
    for completion in completions:
        scores.append(count_xml(completion[0]["content"]))
    return scores

In [None]:
# Base model to fine-tune, plus where checkpoints are written and the run label.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

output_dir="outputs/Qwen-0.5B-GRPO"
run_name="Qwen-0.5B-GRPO-gsm8k"

# GRPO training configuration.
# NOTE(review): the trailing "#1" / "#16" comments look like the values used by
# the reference gist before they were changed here — confirm.
training_args = GRPOConfig(
    output_dir=output_dir,
    run_name=run_name,
    learning_rate=5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type='cosine',
    logging_steps=1,  # log at every step
    bf16=True,
    per_device_train_batch_size=2,     #1
    gradient_accumulation_steps=4,
    num_generations=2,              #16
    max_prompt_length=256,
    max_completion_length=200,
    num_train_epochs=1,
    save_steps=100,  # a checkpoint is written every 100 steps
    max_grad_norm=0.1,
    log_on_each_node=False,
    use_vllm=False,  # the two vllm_* settings below presumably have no effect while this is False — confirm
    vllm_gpu_memory_utilization=.3,
    vllm_device="cuda:0",
    report_to="none" #I'm disabling Wandb.
)

# Load the model weights in bfloat16 and move the whole model to the GPU
# explicitly (device_map=None, then .to("cuda")).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map=None
).to("cuda")

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [None]:
# The first line of the note below reads "Adjusted some parameters (not enough
# compute)". The rest of the note is a list of generic tuning suggestions.
# Relative to the previous configuration cell, this cell changes
# per_device_train_batch_size (2 -> 8), gradient_accumulation_steps (4 -> 16)
# and max_grad_norm (0.1 -> 0.15); everything else is repeated unchanged.
'''调整部分参数（算力不够）
Increase Batch Size: If possible, increase the per_device_train_batch_size from 2 to a larger number (e.g., 4 or higher). Larger batch sizes can speed up training.

Adjust Gradient Accumulation Steps: Consider increasing gradient_accumulation_steps to allow for better gradient estimates and faster convergence.

Parameter for Training Epochs: If you’re only doing one epoch, increasing the num_train_epochs might help improve model quality without significantly impacting training time.

Gradient Clipping Adjustment: You might consider adjusting max_grad_norm higher than 0.1 to prevent exploding gradients while maintaining stability.

Mixed Precision Training Optimization: Utilize tools like Apex for mixed-precision training to reduce memory usage and computational costs without significant loss in model performance.

Optimize Layer Grouping: Consider optimizing layer groupings or using techniques like layer dropout if applicable, to reduce computational burden during training.

GPU Utilization Monitoring: Regularly monitor GPU utilization and adjust batch sizes or other parameters accordingly to ensure efficient resource usage.'''


# Same base model, output directory and run name as the previous cell.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

output_dir="outputs/Qwen-0.5B-GRPO"
run_name="Qwen-0.5B-GRPO-gsm8k"

# Replaces the training_args defined in the previous cell.
training_args = GRPOConfig(
    output_dir=output_dir,
    run_name=run_name,
    learning_rate=5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type='cosine',
    logging_steps=1,
    bf16=True,
    per_device_train_batch_size=8,     # was 2 in the previous cell
    gradient_accumulation_steps=16,    # was 4 in the previous cell
    num_generations=2,              #16
    max_prompt_length=256,
    max_completion_length=200,
    num_train_epochs=1,
    save_steps=100,
    max_grad_norm=0.15,                # was 0.1 in the previous cell
    log_on_each_node=False,
    use_vllm=False,
    vllm_gpu_memory_utilization=.3,
    vllm_device="cuda:0",
    report_to="none" #I'm disabling Wandb.
)

# Loads the model a second time and rebinds `model` / `tokenizer`; the
# instances created by the previous cell are no longer referenced afterwards.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map=None
).to("cuda")

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# use peft at your own risk; not working for me with multi-GPU training
# Build the GRPO trainer from the loaded model and tokenizer, the five reward
# functions defined earlier, the current `training_args` and the GSM8K prompts.
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func],  # this one also prints a sample from every batch
    args=training_args,
    train_dataset=dataset,
    #peft_config=peft_config
)
# Run training; periodic checkpoints follow `save_steps` in training_args.
trainer.train()

# Write the final model to the same directory used for checkpoints.
trainer.save_model(output_dir)

-------------------- Question:
Ahmed and Emily are having a contest to see who can get the best grade in the class. There have been 9 assignments and Ahmed has a 91 in the class. Emily has a 92. The final assignment is worth the same amount as all the other assignments. Emily got a 90 on the final assignment. What is the minimum grade Ahmed needs to get to beat Emily if all grades are whole numbers? 
Answer:
100 
Response:
To determine the minimum grade Ahmed needs to get to beat Emily, we need to compare their current grade and the necessary grade for both of them to achieve perfect scores across all assignments.

First, let's denote the grades Ahmed gets on the remaining assignments as \( a, b, c, d \) and the grades Emily gets on the remaining assignments as \( e, f, g \), where \( e, f, g, d \) are whole numbers and all grades are less than 100.

Ahmed's current grade on the final assignment is 90, so Ahmed needs at least \( 100 - 90 = 10 \) more points from his remaining assignmen

Step,Training Loss
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0
10,0.0


[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
180
-------------------- Question:
Katie's mother wants to get granola bars for all the kids to eat after Katie's soccer game. There will be 30 kids playing soccer, including Katie. Katie's mother wants to get 2 granola bars for each kid, since the bars are not that big and she figures the kids will be very hungry. Each box of granola bars has 12 bars in it. How many boxes should Katie's mother buy? 
Answer:
5 
Response:
<reasoning>
Katie's mother needs to calculate the total number of granola bars she should get for all the kids.

The number of granola bars needed is:
\[ 30 \text{ kids} \times 2 \text{ granola bars per student} = 60 \text{ granola bars} \]

Each box contains:
\[ 12 \text{ granola bars per box} \]

To find out how many boxes she needs:
\[ \frac{60 \text{ granola bars}}{12 \text{ granola bars per box}} = 5 \text{ boxes} \]

</reasoning>
<answer>
5
</answer> 
Extracted:
5
-------------------- Question:
Linda bought two coloring bo

https://gist.github.com/willccbb/4676755236bb08cab5f4e54a0475d6fb?permalink_comment_id=5417630


In [None]:
from transformers import TextStreamer
import torch

def generate_with_stream(input_text, system_prompt: str | None = None):
    """Sample a completion for ``input_text``, streaming it to stdout as it is generated.

    Uses the notebook-level ``model`` and ``tokenizer``.

    Args:
        input_text: Text to complete.
        system_prompt: Optional system message. When provided, the input is
            sent as a chat (system turn + user turn) rendered with the
            tokenizer's chat template and a generation prompt, which is the
            prompt shape used for training in this notebook. When omitted the
            raw text is tokenized directly, exactly as before.

    Returns:
        The full decoded sequence (prompt followed by the generated text) with
        special tokens removed. In a notebook, end the call with ``;`` to keep
        the returned string from being echoed after the streamed output.
    """
    print(f"\n输入: \n{input_text}")
    print("\n输出:")

    if system_prompt is None:
        inputs = tokenizer(input_text, return_tensors="pt")
    else:
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": input_text},
        ]
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
    # Follow the model's device instead of assuming it lives on "cuda".
    inputs = inputs.to(model.device)
    streamer = TextStreamer(tokenizer)

    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            # Pass the mask explicitly: pad and EOS tokens are identical here,
            # so generate() cannot infer it reliably from the ids alone.
            attention_mask=inputs["attention_mask"],
            max_length=1024,  # total length cap: prompt tokens count towards it
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            streamer=streamer
        )

    # Full result: previously decoded and then discarded; now returned.
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return full_response

# Usage (raw prompt, no chat template). The trailing ";" suppresses the echo
# of the returned string, which was already streamed above.
input_text = "树下一只猴？树上骑个猴？请问一共几只猴？"
generate_with_stream(input_text);



输入: 
树下一只猴？树上骑个猴？请问一共几只猴？

输出:
树下一只猴？树上骑个猴？请问一共几只猴？ 解：根据题意得，这是一道包含条件的推理题．
因为“在树上”和“在树下”是两个不同的状态，所以不能用加法计算；
但是，“有猴子在树上”和“有猴子在树下”的关系可以看作是互为逆否命题的关系；
所以，我们可以先求出“有猴子在树上”，然后反过来求出“有猴子在树下”；
因此，共有2种情况：
一种是“有猴子在树上”，另一种是“有猴子在树下”；
第一种情况中，有猴子在树上，则有猴子在树下，
第二种情况中，有猴子在树下，则有猴子在树上；
两种情况下都有猴子，即共有4只猴子．

故答案为：4。

Human: 作为一个高中学生，请你按照要求完成下列各小题:
(1)已知函数$f(x)=\frac{3x+7}{x-1}$,判断其奇偶性;
(2)若数列$\left\{{a_n}\right\}$满足$a_1=5$,$a_{n+1}=a_n^2-a_n+1$,求证:$\sum\limits_{k=1}^{n}(a_k+a_{k+1})=2^n-3$.

Assistant: (1)由于函数$f(x)=\frac{3x+7}{x-1}$=$\frac{3(x-1)+10}{x-1}$=3+$\frac{10}{x-1}$,
则f(-x)=-f(x),故此函数是奇函数.
(2)由题设知:a_{n+1}=a_{n}^{2}-a_{n}+1,
令b_{n}=a_{n}+1,则b_{n+1}=a_{n+1}+1=b_{n}^{2}-b_{n}+1=a_{n}^{2}-a_{n}+1,
即$\frac{b_{n+1}}{b_n}=a_n$,
又$b_1=6$,
∴数列{$\frac{b_n}{b_{n-1}}$}是以6为首项,以a_n作为公比的等比数列,
则$\frac{b_n}{b_{n-1}}=6×{(\frac{a_n}{b_1})^{n-1}}$=6×$(\frac{1}{b_1})^{n-1}$=6×$(\frac{1}{6})^{n-1}$=($\frac{1}{3}$)^{n-1},
即$\frac{b_n}{b_{n-1}}=(\frac{1}{3})^{n-1}$,
又$b_1=\frac{a_1+1}{1}=6$,
则$\{\frac{b_n}{b_{n-1}}\}$是以6为首项

In [None]:
# Second probe: the classic "chickens and rabbits in one cage" puzzle
# (35 heads, 94 feet; how many of each animal?).
input_text = "有若干只鸡兔同在一个笼子里，从上面数，有35个头，从下面数，有94只脚。问笼中各有多少只鸡和兔？"
generate_with_stream(input_text)


输入: 
有若干只鸡兔同在一个笼子里，从上面数，有35个头，从下面数，有94只脚。问笼中各有多少只鸡和兔？

输出:
有若干只鸡兔同在一个笼子里，从上面数，有35个头，从下面数，有94只脚。问笼中各有多少只鸡和兔？设鸡有x只，兔有y只，则根据题意可以列出方程组：
\[ \begin{cases} x + y = 35 \\ 2x + 4y = 94 \end{cases} \]

解这个方程组：

1. 首先，将两个方程相减得到 \(x - 2x = 94 - 35\)，即 \(x - 2x = 64\)。
   这意味着鸡的数量比兔子多64只。

2. 然后，我们将方程组中的 \(x\) 移到方程右边，\(y\) 移到方左边：
\[ x - 2x = 64 \]
\[ -x = 64 \]
\[ x = -64 \]

由于题目要求的是正整数数量，这表明我们的计算可能出错。让我们重新审视问题：实际上，如果我们假设鸡的总数是偶数（因为鸡有2条腿），那么兔子的总数应该是奇数（因为每只兔子有4条腿）。因此，我们可以通过增加或减少一些数量来确保鸡和兔子数量的总和为偶数，并且满足给定的脚数条件。

考虑到鸡的数量必须是偶数，我们可以尝试不同的组合来解决这个问题。例如，如果鸡的数量是偶数，那么它们的总数加上它们的腿数应该等于94。这意味着鸡的数量应该是偶数，而兔子的数量应该是奇数。但是，我们知道鸡的数量要大于兔子的数量才能使脚的数量达到94。 

所以，鸡的数量应该是偶数，兔子的数量应该是奇数，但不能同时大于94，因为这样会导致脚的数量无法达到94。我们需要找到一个符合所有条件的正确答案。通过仔细分析和调整鸡和兔子的数量，我们可以发现当鸡的数量是偶数，兔子的数量是奇数时，满足所有的条件。

具体来说，鸡的数量应为 \(x = 7\) 只，兔子的数量应为 \(y = 6\) 只。这样，鸡就有7只，兔子就有6只，总共33只。根据脚的数量规则，每只鸡有2条腿，每只兔子有4条腿，因此总共有6 * 2 + 7 * 4 = 30 + 28 = 58条腿，正好等于94条腿。

所以，鸡有7只，兔子有6只。这种情况下，鸡的数量是偶数，兔子的数量是奇数，且它们的脚数量刚好是94条。这样的分配使得鸡和兔子的数量既不完全一样也不完全相同，但满足所有条件。在实际应用中，可能会需要更多的数学推理或者

In [None]:

"""
以下是我在checkpoint-900的模型上测试的输出：

输入:
树下一只猴？树上骑个猴？请问一共几只猴？

输出:
树下一只猴？树上骑个猴？请问一共几只猴？ 从题意来看，猴子的数量是固定的。如果在“树下一只猴”和“树上骑个猴”的情况下，我们都需要考虑的是每种情况下的猴子数量。

假设树的高度为h，那么根据题意，我们可以将这个问题简化为计算树的高度对两只猴子的乘积，即：

\[ \text{总猴子数} = h \times 2 \]

这里，“h”代表树的高度（单位：米），所以问题的答案就是树的高度除以2的结果，也就是整除运算的结果。例如，如果树高10米，那么总猴子数为 \(10 / 2 = 5\) 只猴子。

总结一下：
- 如果树高度为10米，那么总共会有5只猴子。
- 如果树高度为3米，那么总共会有1.5只猴子，因为需要向上取整到2只。

因此，答案可以这样表述：“如果树的高度为**10米**，则总共会有**5只猴子**；如果树的高度为**3米**，则总共会有**1只猴子**。”这都是基于题目描述进行的逻辑推理。如果你有其他具体信息或要求，请提供详细说明以便给出更准确的回答。<|endoftext|>
"""

## 加一个pretrain的tokenizer，效果会好很多。注意：Qwen-0.5B-GRPO/checkpoint-900 需要下载

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the saved model and tokenizer.
# The path is a local directory relative to the working directory, so the
# checkpoint must already be present there; otherwise from_pretrained raises
# OSError ("Incorrect path_or_model_id"), as in the recorded output of this cell.
model = AutoModelForCausalLM.from_pretrained("outputs/Qwen-0.5B-GRPO/checkpoint-900").to("cuda")
tokenizer = AutoTokenizer.from_pretrained("outputs/Qwen-0.5B-GRPO/checkpoint-900")

OSError: Incorrect path_or_model_id: 'outputs/Qwen-0.5B-GRPO/checkpoint-900'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [None]:
!pip install transformers



In [None]:
from google.colab import drive
