<a href="https://colab.research.google.com/github/ayyucedemirbas/Group-Relative-Policy-Optimization/blob/main/GRPO_without_flash_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -qqq datasets==3.2.0 transformers==4.47.1 trl==0.14.0 peft==0.14.0 accelerate==1.2.1 bitsandbytes==0.45.2 wandb==0.19.7 --progress-bar off
!pip install -qqq flash-attn --no-build-isolation --progress-bar off

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.[0m[31m
[0m  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone


In [None]:
# Log in to the Hugging Face Hub from the CLI; a later cell calls `push_to_hub`.
!huggingface-cli login

In [2]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer

In [3]:
# Load the "mlabonne/smoltldr" dataset and print its splits and columns.
dataset = load_dataset("mlabonne/smoltldr")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 200
    })
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 200
    })
})


In [4]:
# Base checkpoint to fine-tune; the tokenizer is loaded from the same repo id.
model_id = "HuggingFaceTB/SmolLM-135M-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto",
    # FlashAttention 2 is left disabled in this variant of the notebook;
    # uncomment the next line to opt in (needs the flash-attn package).
    #attn_implementation="flash_attention_2",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [5]:
# LoRA adapter settings: rank 16, alpha 32, using PEFT's "all-linear" target selection.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    target_modules="all-linear",
)
model = get_peft_model(model, lora_config)
# print_trainable_parameters() prints the summary itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
model.print_trainable_parameters()

trainable params: 4,884,480 || all params: 139,399,488 || trainable%: 3.5039
None


In [6]:
ideal_length = 50


def reward_len(completions, **kwargs):
    """Score each completion by how close its length is to ``ideal_length``.

    The length is ``len(completion)``, so for string completions it is a
    character count. A completion of exactly ``ideal_length`` scores 0; every
    unit of deviation in either direction costs one point.

    Args:
        completions: Iterable of sized objects to score.
        **kwargs: Accepted and ignored.

    Returns:
        A list with one non-positive integer reward per completion, in order.
    """
    rewards = []
    for text in completions:
        distance = abs(len(text) - ideal_length)
        rewards.append(-distance)
    return rewards

In [7]:
# GRPO training configuration; checkpoints and logs go under the "GRPO" directory.
training_args = GRPOConfig(
    output_dir="GRPO",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    max_prompt_length=512,
    max_completion_length=96,
    # Number of completions sampled per prompt (the "group" in GRPO).
    num_generations=8,
    # 8-bit AdamW; presumably backed by the bitsandbytes package installed
    # in the first cell -- TODO confirm.
    optim="adamw_8bit",
    num_train_epochs=1,
    # bfloat16 mixed precision; needs an accelerator with bf16 support.
    bf16=True,
    # No experiment tracker is used, even though wandb is installed.
    report_to="none",
    # NOTE(review): presumably keeps dataset columns available to the trainer
    # and reward functions -- confirm against the TRL docs.
    remove_unused_columns=False,
    # Log metrics at every optimizer step.
    logging_steps=1,
)

In [8]:
# Build the GRPO trainer: the LoRA-wrapped model is trained on the "train" split
# with `reward_len` as the only reward function. The `tokenizer` loaded earlier is
# not passed here, and the "validation"/"test" splits are not used by this cell.
trainer = GRPOTrainer(
    model=model,
    reward_funcs=[reward_len],
    args=training_args,
    train_dataset=dataset["train"],
)

In [9]:
# Run GRPO training. The recorded run below reports global_step=1000 and a
# train_runtime of about 8,695 seconds.
trainer.train()

Step,Training Loss
1,-0.0
2,0.0001
3,0.0
4,0.0
5,0.0
6,0.0001
7,0.0
8,0.0001
9,0.0
10,0.0


TrainOutput(global_step=1000, training_loss=0.024692416092380882, metrics={'train_runtime': 8694.6182, 'train_samples_per_second': 0.23, 'train_steps_per_second': 0.115, 'total_flos': 0.0, 'train_loss': 0.024692416092380882})

In [10]:
# Destination repository on the Hugging Face Hub.
hub_repo_id = "ayyuce/SmolGRPO-135M"

# Fold the LoRA weights into the base model so a plain checkpoint is uploaded.
merged_model = trainer.model.merge_and_unload()

# Upload the tokenizer as well: pushing the model alone ships only the weights,
# config and README, and loading the repo by name (e.g. through `pipeline`)
# needs the tokenizer files to live in the same repository.
tokenizer.push_to_hub(hub_repo_id, commit_message="GRPO")

# Kept as the last expression so the cell still displays the model's CommitInfo.
merged_model.push_to_hub(hub_repo_id, commit_message="GRPO")

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ayyuce/SmolGRPO-135M/commit/ed1190b9116c6766ada65ae0b2410a6e9ab40674', commit_message='GRPO', commit_description='', oid='ed1190b9116c6766ada65ae0b2410a6e9ab40674', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ayyuce/SmolGRPO-135M', endpoint='https://huggingface.co', repo_type='model', repo_id='ayyuce/SmolGRPO-135M'), pr_revision=None, pr_num=None)

In [None]:
# Blank placeholder: put the text to send to the model between the triple quotes.
prompt = """

"""

# Single-turn conversation in role/content message format.
messages = [dict(role="user", content=prompt)]

In [None]:
from transformers import pipeline

# Load the merged checkpoint by its full Hub repo id, namespace included. The
# bare name "SmolGRPO-135M" is not the repo that the push cell uploaded to
# ("ayyuce/SmolGRPO-135M"), and this notebook creates no local directory of
# that name.
generator = pipeline("text-generation", model="ayyuce/SmolGRPO-135M")

# Sampling settings for generation.
generate_kwargs = {
    "max_new_tokens": 256,
    "do_sample": True,
    "temperature": 0.5,
    "min_p": 0.1,
}

# The text-generation pipeline takes generation options as keyword arguments of
# the call itself, so the dict is unpacked. Passing it as `generate_kwargs=...`
# would forward an unknown `generate_kwargs` argument to `model.generate`.
generated_text = generator(messages, **generate_kwargs)

print(generated_text)