In [1]:

# Verify torch installation
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")

PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA device: Tesla T4
CUDA version: 12.4


In [2]:
import os
import sys
import random
import numpy as np
import torch
from datasets import load_dataset, Dataset
from huggingface_hub import login
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model
from trl import GRPOConfig, GRPOTrainer
import planetarium
from tqdm.auto import tqdm

In [3]:
# Set seeds for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)


In [4]:

# Get Hugging Face token
from getpass import getpass

# Use this cell to enter your HF token if not stored in Colab secrets
HF_TOKEN = getpass("Enter your Hugging Face token (or press Enter to skip if you've set it in Colab secrets): ")

# Try to get from Colab secrets if needed
if not HF_TOKEN:
    try:
        from google.colab import userdata
        HF_TOKEN = userdata.get('HF_TOKEN')
        print("Successfully retrieved token from Colab secrets")
    except:
        print("No token provided and couldn't find one in Colab secrets")
        print("You may need to restart this notebook and provide a token")

# Log in to Hugging Face Hub
if HF_TOKEN:
    login(token=HF_TOKEN)
    print("Logged in to Hugging Face Hub")
else:
    print("Warning: No Hugging Face token provided. You may face authentication issues.")


Enter your Hugging Face token (or press Enter to skip if you've set it in Colab secrets): ··········
Logged in to Hugging Face Hub


In [5]:

# Load the Planetarium dataset
print("Loading Planetarium dataset...")
dataset = load_dataset("BatsResearch/planetarium")
print(f"Dataset loaded with {len(dataset['train'])} training examples and {len(dataset['test'])} test examples")

# Check dataset structure
print("\nDataset structure:")
print(dataset)


Loading Planetarium dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Dataset loaded with 129975 training examples and 15943 test examples

Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['id', 'name', 'domain', 'init', 'goal', 'num_objects', 'problem_pddl', 'natural_language', 'init_is_abstract', 'init_num_propositions', 'goal_is_abstract', 'goal_num_propositions', 'is_placeholder'],
        num_rows: 129975
    })
    test: Dataset({
        features: ['id', 'name', 'domain', 'init', 'goal', 'num_objects', 'problem_pddl', 'natural_language', 'init_is_abstract', 'init_num_propositions', 'goal_is_abstract', 'goal_num_propositions', 'is_placeholder'],
        num_rows: 15943
    })
})


In [6]:

# Shuffle the training dataset and split it
train_data = dataset["train"].shuffle(seed=SEED)
test_data = dataset["test"]

# Calculate split sizes (20% for SFT, 80% for GRPO)
total_train_size = len(train_data)
sft_size = int(0.2 * total_train_size)
grpo_size = total_train_size - sft_size

# Split the dataset
sft_data = train_data.select(range(sft_size))
grpo_data = train_data.select(range(sft_size, total_train_size))

print(f"\nData splitting complete:")
print(f"SFT: {len(sft_data)} samples (20% of training data)")
print(f"GRPO: {len(grpo_data)} samples (80% of training data)")
print(f"Test: {len(test_data)} samples")



Data splitting complete:
SFT: 25995 samples (20% of training data)
GRPO: 103980 samples (80% of training data)
Test: 15943 samples


In [7]:

# @title Load and Configure Model for SFT

# Model configuration
model_name = "google/gemma-2-2b-it"  # Using 2B instruct model
output_dir = "./sft_model"  # Directory to save the SFT model

# Create the output directory
os.makedirs(output_dir, exist_ok=True)

# Load the model and tokenizer
print(f"Loading model: {model_name}")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Ensure tokenizer has padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Configure LoRA
lora_config = LoraConfig(
    r=16,                 # LoRA rank
    lora_alpha=32,        # Scaling factor
    lora_dropout=0.05,    # LoRA dropout
    bias="none",          # No bias
    task_type="CAUSAL_LM",  # Causal language modeling
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]  # Target attention modules
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
print(f"Model prepared with LoRA. Trainable parameters: {model.print_trainable_parameters()}")

def prepare_prompt(example):
    """Create a prompt from a natural language planning description."""

    prompt = f"""Transform the following natural language description into a PDDL problem:

{example["natural_language"]}

The PDDL problem:
"""
    return prompt


# Prepare SFT dataset
def prepare_sft_dataset(dataset):
    """Prepare dataset for SFT training."""

    prompts = []
    responses = []

    for example in dataset:
        prompt = prepare_prompt(example)
        response = example['problem_pddl']

        prompts.append(prompt)
        responses.append(response)

    # Create a formatted dataset for training
    formatted_dataset = []
    for prompt, response in zip(prompts, responses):
        formatted_dataset.append({
            "input": prompt,
            "output": response
        })

    return Dataset.from_list(formatted_dataset)

# Prepare SFT dataset
sft_dataset = prepare_sft_dataset(sft_data)
print(f"SFT dataset prepared with {len(sft_dataset)} examples")

Loading model: google/gemma-2-2b-it


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

trainable params: 6,389,760 || all params: 2,620,731,648 || trainable%: 0.2438
Model prepared with LoRA. Trainable parameters: None
SFT dataset prepared with 25995 examples


In [9]:


# Training arguments for SFT
sft_training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,      # Batch size of 1 (from the YAML config)
    gradient_accumulation_steps=4,      # Effective batch size of 4
    learning_rate=2e-5,                 # Learning rate (from the YAML config)
    num_train_epochs=1,                 # Number of epochs (from the YAML config)
    logging_steps=50,                   # Log every 50 steps
    save_strategy="epoch",              # Save at the end of each epoch
    save_total_limit=2,                 # Keep only the 2 most recent checkpoints
    bf16=torch.cuda.is_available(),     # Use mixed precision if available (from the YAML config)
    report_to="none",
    remove_unused_columns=True,         # Remove unused columns
    push_to_hub=False,                  # Don't push to Hugging Face Hub
    weight_decay=0.01,
)

# Tokenization function for SFT
max_seq_length = 1500  # From the YAML config

def tokenize_function(examples):
    inputs = examples["input"]
    outputs = examples["output"]

    # Combine input and output
    full_texts = []
    for inp, out in zip(inputs, outputs):
        full_texts.append(f"{inp}{out}")

    # Tokenize with the proper max_length
    tokenized = tokenizer(
        full_texts,
        padding="max_length",
        truncation=True,
        max_length=max_seq_length,
        return_tensors="pt"
    )

    # Create labels (same as input_ids for causal LM)
    tokenized["labels"] = tokenized["input_ids"].clone()

    return tokenized

# Tokenize the SFT dataset
tokenized_sft_dataset = sft_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["input", "output"]
)

# Initialize the trainer
sft_trainer = Trainer(
    model=model,
    args=sft_training_args,
    train_dataset=tokenized_sft_dataset,
    tokenizer=tokenizer,
)


Map:   0%|          | 0/25995 [00:00<?, ? examples/s]

  sft_trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [10]:

# Train the model
print("Starting SFT training...")
sft_trainer.train()

# Save the SFT model
sft_trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"SFT model saved to {output_dir}")

Starting SFT training...


It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


KeyboardInterrupt: 

In [11]:

# Define custom reward function
def custom_reward_function(completions, examples, **kwargs):
    rewards = []

    for generated_pddl, example in zip(completions, examples):
        ground_truth_pddl = example["output"]  # Get ground truth PDDL

        try:
            # Use the Planetarium evaluate function
            parseable, solvable, equivalent = planetarium.evaluate.evaluate(
                ground_truth_pddl,
                generated_pddl
            )

            # Assign rewards based on quality
            if equivalent:
                reward = 1.0  # Correct PDDL -> highest reward
            elif solvable:
                reward = 0.5  # Solvable but incorrect -> medium reward
            elif parseable:
                reward = 0.2  # Parseable but not solvable -> low reward
            else:
                reward = 0.0  # Not even parseable -> no reward
        except Exception as e:
            print(f"Error evaluating PDDL: {e}")
            reward = 0.0  # Error case gets no reward

        rewards.append(reward)

    return rewards


In [12]:

# Prepare GRPO dataset
def prepare_grpo_dataset(dataset):
    """Prepare dataset for GRPO training."""

    prompts = []
    outputs = []

    for example in dataset:
        prompt = prepare_prompt(example)
        output = example['problem_pddl']

        prompts.append(prompt)
        outputs.append(output)

    return Dataset.from_dict({
        "prompt": prompts,
        "output": outputs
    })

# Prepare GRPO dataset
grpo_dataset = prepare_grpo_dataset(grpo_data)
print(f"GRPO dataset prepared with {len(grpo_dataset)} examples")

# Load the SFT model for GRPO training
grpo_model_path = output_dir
grpo_output_dir = "./grpo_model"
os.makedirs(grpo_output_dir, exist_ok=True)

# Load model and tokenizer from SFT checkpoint
model = AutoModelForCausalLM.from_pretrained(
    grpo_model_path,
    device_map="auto",
    torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(grpo_model_path)

# Define GRPO training arguments
grpo_training_args = GRPOConfig(
    output_dir=grpo_output_dir,
    per_device_train_batch_size=1,      # Process one sample at a time
    gradient_accumulation_steps=4,
    learning_rate=1e-5,                 # Lower learning rate for GRPO
    optim="adamw_torch",                # Optimizer
    num_train_epochs=1,                 # One epoch for GRPO
    max_prompt_length=512,
    max_completion_length=1024,
    bf16=torch.cuda.is_available(),
    logging_steps=10,
    num_generations=8,
    report_to="none",
    remove_unused_columns=False,
    weight_decay=0.01,
    max_seq_length=1500,
)

# Initialize GRPO trainer
grpo_trainer = GRPOTrainer(
    model=model,
    args=grpo_training_args,
    tokenizer=tokenizer,
    train_dataset=grpo_dataset,
    reward_funcs=[custom_reward_function]
)


GRPO dataset prepared with 103980 examples


ValueError: Unrecognized model in ./sft_model. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, aria, aria_text, audio-spectrogram-transformer, autoformer, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, colpali, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dab-detr, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deformable_detr, deit, depth_anything, depth_pro, deta, detr, diffllama, dinat, dinov2, dinov2_with_registers, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, emu3, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, git, glm, glpn, got_ocr2, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gptj, gptsan-japanese, granite, granitemoe, granitemoeshared, granitevision, graphormer, grounding-dino, groupvit, helium, hiera, hubert, ibert, idefics, idefics2, idefics3, idefics3_vision, ijepa, imagegpt, informer, instructblip, instructblipvideo, jamba, jetmoe, jukebox, kosmos-2, layoutlm, layoutlmv2, layoutlmv3, led, levit, lilt, llama, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, mistral, mixtral, mllama, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, modernbert, moonshine, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmo2, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, persimmon, phi, phi3, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_5_vl, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rt_detr_v2, rwkv, sam, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, siglip, siglip_vision_model, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superglue, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, table-transformer, tapas, textnet, time_series_transformer, timesformer, timm_backbone, timm_wrapper, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vitpose, vitpose_backbone, vits, vivit, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xmod, yolos, yoso, zamba, zamba2, zoedepth

In [None]:
# Train with GRPO
print("Starting GRPO training...")
grpo_trainer.train()

# Save the GRPO model
grpo_trainer.save_model(grpo_output_dir)
print(f"GRPO model saved to {grpo_output_dir}")


In [None]:
def evaluate_model(model_path, test_dataset, num_samples=100):
    """Evaluate model on test dataset with simplified metrics."""

    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Select a random subset if needed
    if num_samples < len(test_dataset):
        indices = random.sample(range(len(test_dataset)), num_samples)
        eval_dataset = test_dataset.select(indices)
    else:
        eval_dataset = test_dataset

    # Initialize counters
    total = len(eval_dataset)
    parseable_count = 0
    solvable_count = 0
    equivalent_count = 0

    # Process each test example
    for example in tqdm(eval_dataset, desc="Evaluating"):
        # Prepare prompt
        prompt = prepare_prompt(example)

        # Generate PDDL
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1024,
                do_sample=True,
                temperature=0.7,
                top_p=0.9
            )

        # Extract generated PDDL
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_pddl = generated_text[len(prompt):].strip()

        # Get ground truth
        ground_truth_pddl = example["problem_pddl"]
        domain = example["domain"]
        is_placeholder = example["is_placeholder"]

        # Evaluate
        try:
            parseable, solvable, equivalent = planetarium.evaluate.evaluate(
                ground_truth_pddl,
                generated_pddl,
                domain_str=domain,
                is_placeholder=is_placeholder
            )

            # Update counters
            if parseable:
                parseable_count += 1
            if solvable:
                solvable_count += 1
            if equivalent:
                equivalent_count += 1

        except Exception as e:
            print(f"Error evaluating example: {e}")

    # Calculate percentages
    parseable_pct = (parseable_count / total) * 100
    solvable_pct = (solvable_count / total) * 100
    equivalent_pct = (equivalent_count / total) * 100

    return {
        "total": total,
        "parseable_count": parseable_count,
        "solvable_count": solvable_count,
        "equivalent_count": equivalent_count,
        "parseable_pct": parseable_pct,
        "solvable_pct": solvable_pct,
        "equivalent_pct": equivalent_pct
    }

# Print evaluation results in a simplified format
def print_simple_results(results):
    print(f"Total examples: {results['total']}")
    print(f"Parseable: {results['parseable_count']} ({results['parseable_pct']:.2f}%)")
    print(f"Solvable: {results['solvable_count']} ({results['solvable_pct']:.2f}%)")
    print(f"Equivalent (correct): {results['equivalent_count']} ({results['equivalent_pct']:.2f}%)")