<a href="https://colab.research.google.com/github/ammiellewb/wataionboarding/blob/main/rlhf_w_ppo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install dependencies
!pip install flake8 datasets transformers trl torch



In [None]:
import subprocess
import tempfile
from pathlib import Path
import itertools
import random
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer as ClsTrainer,
    TrainingArguments as ClsTrainingArguments,
)
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead

In [None]:
# Mount Google Drive at /content/drive (requires the `google.colab` package).
# NOTE(review): no other cell shown in this notebook reads from or writes to
# /content/drive - confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Select the compute device once: CUDA when it is available, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Stream the dataset so the whole corpus is never downloaded up front.
stream_ds = load_dataset(
    "codeparrot/codeparrot-clean",
    split="train",
    streaming=True
)

# Shuffle through a small in-memory buffer, then cap the stream at 2,000
# examples. The seed is passed to `shuffle` itself: seeding Python's `random`
# module does not determine the order produced by the dataset shuffle.
random.seed(42)
shuffled = stream_ds.shuffle(seed=42, buffer_size=10_000)
small_iter = itertools.islice(shuffled, 2000)

# Preview the first 50 characters of 5 examples. `small_iter` is a single-pass
# iterator shared with the later collection cells, so the previewed examples
# are chained back in front of it instead of being silently consumed.
preview = list(itertools.islice(small_iter, 5))
small_iter = itertools.chain(preview, small_iter)

for idx, ex in enumerate(preview):
    print(idx, ex["content"][:50])

Resolving data files:   0%|          | 0/54 [00:00<?, ?it/s]

0 #!/usr/bin/env python
# This file is part of Tryto
1 #!/usr/bin/env python
#
# This program is free sof
2 import os

from setuptools import setup, find_pack
3 import os
import numpy as np
from dipy.viz import 
4 from __future__ import unicode_literals

import co
5 # -*- coding: utf-8 -*-
#
# Junos Ansible Modules 


In [None]:
# compute a style score from 0.0 to 1.0 using flake8
def get_style_score(code: str, max_vios: int = 10) -> float:
    """Score `code` by the number of flake8 violations it triggers.

    The source is written to a temporary ``.py`` file and linted with
    ``flake8 --max-line-length=88``. Every line flake8 prints to stdout is
    counted as one violation.

    Args:
        code: Python source text to lint.
        max_vios: Violation count at which the score bottoms out at 0.0.
            Must be positive.

    Returns:
        ``max(0.0, 1.0 - violations / max_vios)``: 1.0 when flake8 reports
        nothing, decreasing linearly to 0.0 at ``max_vios`` or more.

    Raises:
        ValueError: If ``max_vios`` is not positive.
        RuntimeError: If flake8 exits with a non-zero status without reporting
            any violation, i.e. the linter itself failed. Counting that as
            zero violations would silently produce a perfect score.
    """
    if max_vios <= 0:
        raise ValueError(f"max_vios must be positive, got {max_vios}")

    # delete=False so the file can be closed (and fully flushed) before flake8
    # opens it by name; it is removed explicitly below.
    with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as tf:
        tf.write(code.encode("utf-8"))
        path = Path(tf.name)

    try:
        result = subprocess.run(
            ["flake8", "--max-line-length=88", str(path)],
            capture_output=True,
            text=True
        )
    finally:
        # Always remove the temp file, even when launching flake8 fails.
        path.unlink(missing_ok=True)

    vios = len(result.stdout.splitlines())
    if result.returncode != 0 and vios == 0:
        raise RuntimeError(
            f"flake8 failed with exit code {result.returncode}: "
            f"{result.stderr.strip()}"
        )

    return max(0.0, 1.0 - vios / max_vios)

In [None]:
# collect compliant snippets for fine-tuning:
# keep only sources with a perfect style score, stopping once 200 are found
compliant_snippets = []

for example in small_iter:
    snippet = example["content"]
    if get_style_score(snippet) != 1.0:
        continue
    compliant_snippets.append(snippet)
    if len(compliant_snippets) >= 200:
        break

In [None]:
# collect up to 400 mixed snippets as labelled data for the reward model:
# label 1 marks a perfect style score, label 0 anything else
labeled_data = [
    {
        "code": example["content"],
        "label": int(get_style_score(example["content"]) == 1.0),
    }
    for example in itertools.islice(small_iter, 400)
]

In [None]:
# fine-tune CodeParrot on compliant snippets

# Tokenizer: the end-of-sequence token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained("codeparrot/codeparrot-small")
tokenizer.pad_token = tokenizer.eos_token

# Model: size the embedding matrix to the tokenizer's vocabulary.
model = AutoModelForCausalLM.from_pretrained("codeparrot/codeparrot-small")
model.resize_token_embeddings(len(tokenizer))

# Persist the tokenizer into the directory that will hold the fine-tuned model.
tokenizer.save_pretrained("codeparrot-ft")

('codeparrot-ft/tokenizer_config.json',
 'codeparrot-ft/special_tokens_map.json',
 'codeparrot-ft/vocab.json',
 'codeparrot-ft/merges.txt',
 'codeparrot-ft/added_tokens.json',
 'codeparrot-ft/tokenizer.json')

In [None]:
# prepare dataset: tokenize all compliant snippets in one call, truncating
# over-long inputs and padding every row to the longest one in the batch
encodings = tokenizer(compliant_snippets, padding="longest", truncation=True, return_tensors="pt")

In [None]:
class LMData(torch.utils.data.Dataset):
    """Map-style dataset over a tokenizer encoding for language-model training.

    Each item is a dict with `input_ids`, `attention_mask` and `labels`, where
    `labels` is taken from the same row of `input_ids`.
    """

    def __init__(self, enc):
        # `enc` must expose `input_ids` and `attention_mask` attributes.
        self.input_ids = enc.input_ids
        self.attn_mask = enc.attention_mask

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        sample = {}
        sample["input_ids"] = self.input_ids[idx]
        sample["attention_mask"] = self.attn_mask[idx]
        sample["labels"] = self.input_ids[idx]
        return sample

In [None]:
# Ask PyTorch to release cached GPU memory it is not currently using.
torch.cuda.empty_cache()

In [None]:
# Supervised fine-tuning of the base model on the compliant snippets.
lm_dataset = LMData(encodings)
# mlm=False selects causal (next-token) language modelling batches.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

lm_args = TrainingArguments(
    output_dir="codeparrot-ft",
    per_device_train_batch_size=4,
    num_train_epochs=1,
    logging_steps=10,
    save_total_limit=1,
    # Disable external experiment trackers: without this the recorded run
    # stopped to ask for a wandb API key, which blocks an unattended Run All.
    report_to="none"
)
lm_trainer = Trainer(
    model=model,
    args=lm_args,
    train_dataset=lm_dataset,
    data_collator=data_collator
)

lm_trainer.train()
# Save the fine-tuned weights next to the tokenizer saved earlier.
model.save_pretrained("codeparrot-ft")



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mammiellewb[0m ([33mammiellewb-university-of-waterloo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,1.5478
20,1.531
30,1.5258
40,1.4281
50,1.2961


In [None]:
# reward model: small classifier on style adherence
cls_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
cls_model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

# prepare classification dataset: split the records into parallel lists
texts, labels = [], []
for record in labeled_data:
    texts.append(record["code"])
    labels.append(record["label"])

cls_enc = cls_tokenizer(
    texts,
    truncation=True,
    padding="longest",
    return_tensors="pt",
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
class CLSData(torch.utils.data.Dataset):
    """Map-style dataset pairing a tokenizer encoding with class labels.

    Each item is a dict with `input_ids`, `attention_mask` and `labels`.
    """

    def __init__(self, enc, labels):
        # `enc` must expose `input_ids` and `attention_mask` attributes;
        # `labels` is converted to a tensor once, up front.
        self.input_ids = enc.input_ids
        self.attn_mask = enc.attention_mask
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        sample["input_ids"] = self.input_ids[idx]
        sample["attention_mask"] = self.attn_mask[idx]
        sample["labels"] = self.labels[idx]
        return sample


In [None]:
# Train the style classifier on the flake8-labelled snippets.
cls_dataset = CLSData(cls_enc, labels)
cls_args = ClsTrainingArguments(
    output_dir="style-cls",
    per_device_train_batch_size=8,
    num_train_epochs=1,
    logging_steps=10,
    save_total_limit=1,
    # Disable external experiment trackers so the cell never waits for an
    # interactive login (e.g. a wandb API key).
    report_to="none",
)
cls_trainer = ClsTrainer(model=cls_model, args=cls_args, train_dataset=cls_dataset)

cls_trainer.train()
cls_model.save_pretrained("style-cls")

# reward model (style classifier): from here on it is only used for scoring,
# so move it to the target device and switch it to evaluation mode
reward_model = cls_model.to(device)
reward_model.eval()

Step,Training Loss
10,0.5381
20,0.393
30,0.4517
40,0.3817
50,0.3607


In [None]:
# prepare prompts for PPO
test_prompts = ["def add(a, b):", "class Person:", "def compute():", "def process_data(data):"]
raw_dataset = Dataset.from_dict({"query": test_prompts})

# The policy is a decoder-only model that continues from the last position of
# the prompt, so padding must go on the left; right padding would place pad
# tokens between the prompt and the generated continuation.
tokenizer.padding_side = "left"


def tokenize_prompts(ex):
    """Tokenize a batch of prompts to a fixed length of 32 tokens.

    Args:
        ex: Batch dict with a "query" key holding the prompt strings.

    Returns:
        The tokenizer output for the batch (including `input_ids` and
        `attention_mask`), truncated/padded to 32 tokens.
    """
    return tokenizer(ex["query"], truncation=True, padding="max_length", max_length=32)


train_dataset = raw_dataset.map(tokenize_prompts, batched=True, remove_columns=["query"])
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [None]:
# fine-tuned tokenizer
# Configure left padding at load time: the policy is a decoder-only model, and
# relying on a later cell to flip `padding_side` makes the result depend on
# which cells happen to have been run.
tokenizer = AutoTokenizer.from_pretrained("codeparrot-ft", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from trl import create_reference_model

# RLHF via PPO: 2 gradient updates
# Builds the PPO configuration, the policy (fine-tuned model wrapped with a
# value head) and a reference copy of it.
ppo_config = PPOConfig(
    output_dir="results/style-ppo",
    overwrite_output_dir=True,
    do_train=True,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=1.41e-5,

    # PPO-specific
    sft_model_path="codeparrot-ft",
    reward_model_path="style-cls",
    exp_name="style-ppo",
    batch_size=4,
    mini_batch_size=4,
    num_ppo_epochs=1,
    # NOTE(review): presumably `total_episodes` counts sampled episodes rather
    # than generate->update loops - confirm against the installed TRL version.
    total_episodes=2 # 2 generate->update loops
)

# NOTE(review): the recorded run of `ppo_trainer.train()` in this notebook
# failed with "AttributeError: 'tuple' object has no attribute 'logits'".
# Presumably this value-head wrapper returns a tuple from its forward pass
# while the PPOTrainer in use expects an output object with `.logits` -
# TODO confirm which model classes the installed PPOTrainer expects.
ppo_model = AutoModelForCausalLMWithValueHead.from_pretrained("codeparrot-ft")

# reference copy
ref_model = create_reference_model(ppo_model)
ref_model.eval()  # no updates

AutoModelForCausalLMWithValueHead(
  (pretrained_model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(32768, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D(nf=2304, nx=768)
            (c_proj): Conv1D(nf=768, nx=768)
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D(nf=3072, nx=768)
            (c_proj): Conv1D(nf=768, nx=3072)
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Line

In [None]:
from transformers import GenerationConfig

# Patch attributes onto the policy and reference wrappers before handing them
# to PPOTrainer.
# NOTE(review): even with these patches the recorded `ppo_trainer.train()` run
# failed with "'tuple' object has no attribute 'logits'", so the return_dict
# settings below did not change what the trainer received - TODO confirm.

# attach a GenerationConfig built from every field of the policy's config dict
gen_conf = GenerationConfig(**ppo_model.config.to_dict())
ppo_model.generation_config = gen_conf
ref_model.generation_config = gen_conf

# set base_model_prefix to "pretrained_model", the attribute name used below
# to reach the wrapped model
ppo_model.base_model_prefix = "pretrained_model"
ref_model.base_model_prefix = "pretrained_model"

# request dict-style outputs on both the wrapper config and the wrapped model's config
ppo_model.config.return_dict = True
ref_model.config.return_dict = True
ppo_model.pretrained_model.config.return_dict = True
ref_model.pretrained_model.config.return_dict = True

# move both models to the selected device
ppo_model.to(device)
ref_model.to(device)

AutoModelForCausalLMWithValueHead(
  (pretrained_model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(32768, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D(nf=2304, nx=768)
            (c_proj): Conv1D(nf=768, nx=768)
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D(nf=3072, nx=768)
            (c_proj): Conv1D(nf=768, nx=3072)
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Line

In [None]:
from transformers import default_data_collator

# instantiate PPOTrainer with required args
# Everything except `data_collator` is passed positionally, so the meaning of
# each argument depends on the parameter order of the installed PPOTrainer.
# NOTE(review): `ppo_model` is passed twice (3rd and 7th positional argument),
# presumably as both the policy and the value model - TODO confirm against the
# installed TRL PPOTrainer signature.
# NOTE(review): `reward_model` is the DistilBERT classifier whose training
# inputs were tokenized with `cls_tokenizer`, while the tokenizer passed here
# is the CodeParrot one - confirm the trainer scores text with a matching
# tokenization.
ppo_trainer = PPOTrainer(
    ppo_config,
    tokenizer,
    ppo_model,
    ref_model,
    reward_model,
    train_dataset,
    ppo_model,
    data_collator=default_data_collator
  )

In [None]:
# reward function based on the flake8 style score of the decoded responses
def reward_fn(responses):
    """Score generated responses with `get_style_score`.

    Args:
        responses: Iterable of token-id sequences accepted by `tokenizer.decode`.

    Returns:
        A list with one scalar tensor per response, placed on `device`.
    """
    # Decode everything first, then score, so all decoding happens before
    # any flake8 subprocess is launched.
    decoded = []
    for response in responses:
        decoded.append(tokenizer.decode(response, skip_special_tokens=True))

    rewards = []
    for text in decoded:
        rewards.append(torch.tensor(get_style_score(text), device=device))
    return rewards

In [None]:
# before RLHF: baseline generation
test_prompts = ["def add(a, b):", "class Person:", "def compute():", "def process_data(data):"]

# pad on the left and reuse EOS as the padding id
tokenizer.padding_side = "left"
model.config.pad_token_id = tokenizer.eos_token_id

# generate one completion per prompt with the policy and score its style
baseline_scores = []
for prompt in test_prompts:
    prompt_inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
    generated = ppo_model.generate(**prompt_inputs, max_length=50, pad_token_id=tokenizer.pad_token_id)
    completion = tokenizer.decode(generated[0], skip_special_tokens=True)
    baseline_scores.append(get_style_score(completion))

print(baseline_scores)

[0.9, 0.9, 0.9, 0.9]


In [None]:
# run 2 PPO gradient updates
# NOTE(review): the output recorded for this cell shows the call failing with
# "AttributeError: 'tuple' object has no attribute 'logits'", so no PPO update
# completed in the saved run.
ppo_trainer.train()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


===training policy===


AttributeError: 'tuple' object has no attribute 'logits'

In [None]:
# after RLHF: post-PPO generation
# generate one completion per prompt with the policy and score its style
post_scores = []

for prompt in test_prompts:
    prompt_inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
    generated = ppo_model.generate(**prompt_inputs, max_length=50, pad_token_id=tokenizer.pad_token_id)
    completion = tokenizer.decode(generated[0], skip_special_tokens=True)
    post_scores.append(get_style_score(completion))

print(post_scores)

===training policy===


TypeError: 'NoneType' object is not subscriptable

In [None]:
# eval: comparing style scores
# Print the baseline and post-RLHF style score for every test prompt.
for i, q in enumerate(test_prompts):
    print(f"Prompt: {q}")
    print(f"Baseline score: {baseline_scores[i]}")
    # the closing quote of this f-string was missing, which made the cell a SyntaxError
    print(f"Post-RLHF score: {post_scores[i]}")