# Compatibility Prediction

In [None]:
import os
import sys
import logging
from pathlib import Path
from datasets import concatenate_datasets
from datasets import load_dataset
from peft import LoraConfig
from trl import GRPOConfig, GRPOTrainer
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainerCallback,
)
from transformers.integrations import MLflowCallback
from types import SimpleNamespace
import transformers
from tqdm import tqdm

# Print the library versions and the CUDA status for this kernel.
print(f"Transformers Version: {transformers.__version__}")
print(f"Torch Version: {torch.__version__}")
if not torch.cuda.is_available():
    print("No CUDA device available")
else:
    print(f"{torch.cuda.device_count()} CUDA ({torch.version.cuda}) device available")

2025-03-03 03:06:58,318	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
No module named 'vllm._version'
  from vllm.version import __version__ as VLLM_VERSION


Transformers Version: 4.47.1
Torch Version: 2.4.0+cu121
4 CUDA (12.1) device available


In [None]:
# Directory holding the JSONL splits and the shared filename prefix.
data_dir = "/home/azureuser/localfiles/data/polyvore_cp"
output_string = 'compatibility_text_based'

# Per-split file names. The test file name is defined here but is not loaded below.
grpo_train_file = f"{output_string}_grpo_train_14272.jsonl"
grpo_dev_file = f"{output_string}_grpo_dev_1352.jsonl"
grpo_test_file = f"{output_string}_grpo_test_73213.jsonl"

# Map split name -> absolute path, then load both splits as one DatasetDict.
split_files = (("train", grpo_train_file), ("dev", grpo_dev_file))
data_files = {
    split_name: os.path.join(data_dir, file_name)
    for split_name, file_name in split_files
}
dataset = load_dataset("json", data_files=data_files)
dataset

Generating train split: 14272 examples [00:00, 79199.80 examples/s]
Generating dev split: 1352 examples [00:00, 92134.58 examples/s]


DatasetDict({
    train: Dataset({
        features: ['split', 'num_items', 'gt', 'prompt', 'completion', 'prompt_tokens', 'completion_tokens'],
        num_rows: 14272
    })
    dev: Dataset({
        features: ['split', 'num_items', 'gt', 'prompt', 'completion', 'prompt_tokens', 'completion_tokens'],
        num_rows: 1352
    })
})

In [6]:
# Display the training split's schema and row count.
dataset["train"]

Dataset({
    features: ['split', 'num_items', 'gt', 'prompt', 'completion', 'prompt_tokens', 'completion_tokens'],
    num_rows: 14272
})

In [13]:
# Run configuration for the GRPO fine-tuning job, kept in one place so every
# later cell reads its settings through `args.<name>`.
arg_dict = dict(
    # Base model and output location
    model_name_or_path="microsoft/Phi-3.5-mini-instruct",
    max_seq_len=1100,
    output_dir="/home/azureuser/localfiles/models/grpo/",
    # Sequence budgets
    max_prompt_length=800,
    max_completion_length=256,
    # Optimisation schedule
    lr=5e-6,
    num_epochs=1,
    batch_size=8,
    grad_accumulation_steps=8,
    num_generations=8,
    # LoRA adapter settings
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
    # Logging
    logging_steps=100,
    train_log_filename="/home/azureuser/localfiles/data/polyvore_cp/grpo_training.log",
)
# Attribute-style view over the same settings.
args = SimpleNamespace(**arg_dict)

In [14]:
# Dedicated logger that writes training metrics to a file only.
logger = logging.getLogger("trainer_logger")
logger.setLevel(logging.INFO)
# Keep these records out of the root logger (and therefore the notebook output).
logger.propagate = False

# logging.getLogger returns the same object for the same name, so re-running
# this cell would otherwise stack another FileHandler on it and every metric
# line would be written multiple times. Drop and close earlier handlers first.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# File handler for logging
file_handler = logging.FileHandler(args.train_log_filename)
file_handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(message)s")
file_handler.setFormatter(formatter)

logger.addHandler(file_handler)

class LogMetricsCallback(TrainerCallback):
    """Trainer callback that forwards metrics to the module-level ``logger``."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Write the logged metrics together with the current global step."""
        if not logs:
            logs = {}
        logger.info(f"Step: {state.global_step}, Metrics: {logs}")

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Write the evaluation metrics together with the current global step."""
        if not metrics:
            metrics = {}
        logger.info(f"Step: {state.global_step}, Eval Metrics: {metrics}")


In [15]:
# Keyword arguments forwarded to `from_pretrained` when loading the base model.
model_kwargs = dict(
    use_cache=False,
    trust_remote_code=False,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Load the causal LM and its tokenizer from the same checkpoint.
checkpoint = args.model_name_or_path
model = AutoModelForCausalLM.from_pretrained(checkpoint, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.23it/s]


In [16]:
# Pad on the left and cap the tokenizer's maximum length at the configured value.
tokenizer.padding_side = "left"
tokenizer.model_max_length = args.max_seq_len

# Use unk rather than eos token to prevent endless generation
tokenizer.pad_token = tokenizer.unk_token
pad_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.pad_token_id = pad_id

In [17]:
import re

def format_reward_func(completions, **kwargs):
    """Reward completions shaped as ``<think>...</think><answer>...</answer>``.

    Args:
        completions: One entry per sampled completion. An entry is either a
            list of chat messages (the text is read from
            ``entry[0]["content"]``) or a plain string.
        **kwargs: Additional keyword arguments supplied by the caller; unused.

    Returns:
        list[float]: ``1.0`` for every completion whose text matches the
        expected layout from start to end, ``0.0`` otherwise.
    """
    # re.DOTALL lets "." cross line breaks. Without it, any completion whose
    # reasoning or answer spans more than one line can never match and is
    # always scored 0.
    pattern = re.compile(r"^<think>.*?</think><answer>.*?</answer>$", re.DOTALL)

    rewards = []
    for completion in completions:
        # Accept plain-string completions as well as chat-message lists.
        content = completion if isinstance(completion, str) else completion[0]["content"]
        rewards.append(1.0 if pattern.match(content) else 0.0)
    return rewards


def reward_fun(completions, gt, **kwargs):
    """Reward completions whose last ``<answer>...</answer>`` equals the label.

    Args:
        completions: One entry per sampled completion. An entry is either a
            list of chat messages (the text is read from
            ``entry[0]["content"]``) or a plain string.
        gt: Ground-truth labels, aligned one-to-one with ``completions``.
            Non-string labels are compared through ``str(label)``.
        **kwargs: Additional keyword arguments supplied by the caller; unused.

    Returns:
        list[float]: ``1.0`` where the stripped text of the last answer tag
        equals the stripped label, ``0.0`` otherwise, including when no answer
        tag is present.
    """
    # re.DOTALL lets the capture cross line breaks, so an answer written as
    # "<answer>\nyes\n</answer>" is extracted and then stripped instead of
    # being silently missed.
    answer_pattern = re.compile(r"<answer>(.*?)</answer>", re.DOTALL)

    reward_score = []
    for completion, label in zip(completions, gt):
        # Accept plain-string completions as well as chat-message lists.
        solution_str = completion if isinstance(completion, str) else completion[0]["content"]

        # When several answer tags appear, only the last one counts.
        answers = answer_pattern.findall(solution_str)
        final_answer = answers[-1].strip() if answers else None

        expected = None if label is None else str(label).strip()
        is_correct = final_answer is not None and final_answer == expected
        reward_score.append(1.0 if is_correct else 0.0)
    return reward_score

In [18]:
# GRPO training configuration. Every tunable value is read from `args` so the
# configuration cell is the single place to change it.
training_args = GRPOConfig(
    output_dir=args.output_dir,
    logging_steps=args.logging_steps,
    use_vllm=False,
    # Previously hard-coded as 5e-6, which silently ignored `args.lr`.
    learning_rate=args.lr,
    num_train_epochs=args.num_epochs,
    per_device_train_batch_size=args.batch_size,
    gradient_accumulation_steps=args.grad_accumulation_steps,
    num_generations=args.num_generations,
    max_prompt_length=args.max_prompt_length,
    max_completion_length=args.max_completion_length,
    temperature=1,
    weight_decay=0.1,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    bf16=True,
    save_strategy="no",
    torch_empty_cache_steps=None,  # Disable torch empty cache steps
    torch_compile=False,          # Disable torch compilation
)

# LoRA adapter settings, read from the run configuration.
peft_config = dict(
    r=args.lora_r,
    lora_alpha=args.lora_alpha,
    lora_dropout=args.lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=args.target_modules,
    modules_to_save=None,
)
peft_conf = LoraConfig(**peft_config)

# Two reward functions are passed: answer correctness and output format.
reward_functions = [reward_fun, format_reward_func]

# Only the train split is handed to the trainer; no evaluation dataset is passed here.
trainer = GRPOTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    processing_class=tokenizer,
    reward_funcs=reward_functions,
    peft_config=peft_conf,
    callbacks=[LogMetricsCallback()],
)

In [19]:
# Run GRPO training and display the returned result.
# NOTE(review): the recorded output of this cell ends in an IsADirectoryError whose
# "path" is the HTML progress-bar markup; the cause is not visible in this notebook
# and should be investigated before relying on this run.
train_output = trainer.train()
train_output

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33maeroabir[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


IsADirectoryError: [Errno 21] Is a directory: '\n    <div>\n      \n      <progress value=\'2\' max=\'1784\' style=\'width:300px; height:20px; vertical-align: middle;\'></progress>\n      [   2/1784 : < :, Epoch 0.00/1]\n    </div>\n    <table border="1" class="dataframe">\n  <thead>\n <tr style="text-align: left;">\n      <th>Step</th>\n      <th>Training Loss</th>\n    </tr>\n  </thead>\n  <tbody>\n  </tbody>\n</table><p>'