In [2]:
!pip install transformers datasets trl -qU

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/318.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.4/318.4 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/105.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.9/105.9 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# from google.colab import drive
# drive.mount('/content/drive',force_remount=True)

In [4]:
# !git clone https://github.com/ashutoshtiwari13/trl.git

In [5]:
import os
from getpass import getpass

# Take the Hugging Face token from the environment and only prompt for it when
# it is missing, so the secret is never stored in the notebook source.
if not os.environ.get('HF_TOKEN'):
    os.environ['HF_TOKEN'] = getpass('Hugging Face token: ')

# Likewise, provide the Weights & Biases key through the WANDB_API_KEY
# environment variable instead of keeping it in a comment.

In [6]:
# import sys
# sys.path.append('/content/trl')

In [7]:
# Sanity check: print the file the kernel actually imports `trl` from.
import trl
print(trl.__file__ )

/usr/local/lib/python3.10/dist-packages/trl/__init__.py


In [8]:
# %cd /content/trl

In [9]:
# !pip install -r requirements.txt

In [10]:
import torch
from transformers import AutoTokenizer,AutoModelForCausalLM
from datasets import load_dataset
from trl import PPOConfig, PPOTrainer,AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler
import numpy as np
from typing import List, Dict

In [64]:
class SymbolicEnvironment:
    """Turns a sentiment classifier into feedback for prompt/response pairs.

    Args:
        tokenizer: Object exposing ``tokenize(text)``; used only to count the
            tokens of the combined prompt and response.
        sentiment_analyzer: Callable that takes a string and returns a list
            whose first element is a dict with a ``'score'`` entry and,
            normally, a ``'label'`` entry (the shape produced by a Hugging Face
            text-classification pipeline).
    """

    # Labels treated as the negative class (compared case-insensitively).
    _NEGATIVE_LABELS = frozenset({"NEGATIVE", "NEG"})

    def __init__(self, tokenizer, sentiment_analyzer):
        self.tokenizer = tokenizer
        self.sentiment_analyzer = sentiment_analyzer

    def analyze_sentiment(self, text: str) -> float:
        """Return the probability that ``text`` is positive.

        The analyzer reports the confidence of whichever label it predicted,
        so a confident NEGATIVE prediction also carries a high score. The
        score is flipped for negative labels so that the returned value is
        comparable against 0.5 (above = positive, below = negative). When the
        result carries no label, the raw score is returned unchanged.
        """
        top = self.sentiment_analyzer(text)[0]
        score = top['score']
        label = str(top.get('label', '')).upper()
        if label in self._NEGATIVE_LABELS:
            return 1.0 - score
        return score

    def get_vectorized_feedback(self, prompt: str, response: str) -> List[float]:
        """Return one feedback value per token of ``prompt + response``.

        Every token receives the same value: the classifier's confidence in
        its predicted sentiment for the full text, i.e. ``max(p, 1 - p)`` where
        ``p`` is the positive probability. The list is empty when the
        tokenizer produces no tokens.
        """
        full_text = prompt + response
        tokens = self.tokenizer.tokenize(full_text)
        positive_prob = self.analyze_sentiment(full_text)

        # Confidence of the winning class, regardless of which class won.
        confidence = positive_prob if positive_prob > 0.5 else 1 - positive_prob
        return [confidence] * len(tokens)

In [76]:
class RLSFTrainer:
    """PPO fine-tuning loop whose rewards come from a symbolic environment.

    Builds a TRL ``PPOTrainer`` from a policy model and an independently
    loaded reference model, and offers ``train`` / ``evaluate`` over datasets
    whose items are mappings with ``"input_ids"`` and ``"label"`` entries.
    Items may be single examples (1-D ``input_ids``) or pre-batched
    (2-D ``input_ids``); both are regrouped into lists of
    ``config.batch_size`` examples.
    """

    # Number of tokens generated per prompt in both training and evaluation.
    MAX_NEW_TOKENS = 20

    def __init__(self, config: PPOConfig, model_name: str, symbolic_env: SymbolicEnvironment):
        """Load tokenizer, policy and reference model, and build the PPO trainer.

        Args:
            config: PPO configuration; ``config.batch_size`` also determines
                how many examples are grouped per PPO step.
            model_name: Name or path passed to ``from_pretrained``.
            symbolic_env: Environment providing ``get_vectorized_feedback`` and
                ``analyze_sentiment``. ``train`` and ``evaluate`` raise
                ``ValueError`` if this is ``None``.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Load the policy and the reference from the checkpoint separately.
        # Wrapping one shared base-model instance twice would make the
        # "reference" share (and follow) the policy's weights.
        self.model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name)
        self.ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name)
        self.symbolic_env = symbolic_env
        self.ppo_trainer = PPOTrainer(config, self.model, self.ref_model, self.tokenizer)

    def _require_env(self):
        """Fail early with a clear message when no environment was supplied."""
        if self.symbolic_env is None:
            raise ValueError(
                "RLSFTrainer needs a symbolic_env to compute rewards and predictions; got None."
            )

    def _strip_padding(self, input_ids):
        """Return ``input_ids`` without leading/trailing pad tokens.

        An all-padding sequence is reduced to a single token so the model
        still has something to condition on.
        """
        non_pad = (input_ids != self.tokenizer.pad_token_id).nonzero()
        if non_pad.numel() == 0:
            return input_ids[:1]
        first, last = int(non_pad[0]), int(non_pad[-1])
        return input_ids[first:last + 1]

    def _iter_batches(self, dataset, drop_last: bool):
        """Yield ``(queries, labels)`` lists of up to ``config.batch_size`` examples.

        ``queries`` are 1-D, padding-free tensors. With ``drop_last`` an
        incomplete final batch is discarded.
        """
        batch_size = self.config.batch_size
        queries, labels = [], []
        for item in dataset:
            ids = torch.as_tensor(item["input_ids"])
            item_labels = torch.as_tensor(item["label"]).reshape(-1)
            # Treat a single example as a batch of one.
            rows = ids.unsqueeze(0) if ids.dim() == 1 else ids
            for row, label in zip(rows, item_labels):
                query = self._strip_padding(row)
                if query.numel() == 0:
                    continue  # nothing to generate from
                queries.append(query)
                labels.append(label)
                if len(queries) == batch_size:
                    yield queries, labels
                    queries, labels = [], []
        if queries and not drop_last:
            yield queries, labels

    def _generate(self, queries, **generate_kwargs):
        """Generate continuations only (prompt excluded) for a list of queries."""
        responses = self.ppo_trainer.generate(
            queries,
            return_prompt=False,  # responses must not repeat the prompt tokens
            pad_token_id=self.tokenizer.pad_token_id,
            max_new_tokens=self.MAX_NEW_TOKENS,
            **generate_kwargs,
        )
        return list(responses)

    def _reward(self, prompt: str, response: str):
        """Collapse the environment's per-token feedback into one scalar tensor."""
        feedback = self.symbolic_env.get_vectorized_feedback(prompt, response)
        if len(feedback) == 0:
            return torch.tensor(0.0)
        return torch.tensor(feedback, dtype=torch.float32).mean()

    def train(self, dataset, num_epochs: int):
        """Run ``num_epochs`` passes of generate -> score -> PPO step.

        Errors from generation or the PPO step propagate to the caller
        instead of being printed and skipped, so a misconfigured run cannot
        silently "complete" without updating the model.

        Raises:
            ValueError: If the trainer was built without a symbolic environment.
        """
        self._require_env()
        sampling_kwargs = {
            "do_sample": True,
            "top_k": 0,
            "top_p": 1.0,
            "temperature": 1.0,
        }

        for epoch in range(num_epochs):
            # The PPO step is given exactly config.batch_size samples, so an
            # incomplete trailing batch is dropped.
            for queries, _labels in self._iter_batches(dataset, drop_last=True):
                responses = self._generate(queries, **sampling_kwargs)

                prompts_text = self.tokenizer.batch_decode(queries, skip_special_tokens=True)
                responses_text = self.tokenizer.batch_decode(responses, skip_special_tokens=True)

                # One scalar score per sample, passed as lists of tensors
                # (see the TRL PPOTrainer.step documentation).
                rewards = [
                    self._reward(prompt, response)
                    for prompt, response in zip(prompts_text, responses_text)
                ]
                self.ppo_trainer.step(queries, responses, rewards)

            print(f"Epoch {epoch + 1} completed")

    def evaluate(self, dataset) -> Dict[str, float]:
        """Score greedy continuations and compare the sentiment to the labels.

        A sample counts as correct when the environment's sentiment score for
        ``prompt + response`` is above 0.5 and the label is 1, or not above
        0.5 and the label is 0.

        Returns:
            ``{"accuracy": correct / total}``; accuracy is 0.0 when the
            dataset yields no examples.

        Raises:
            ValueError: If the trainer was built without a symbolic environment.
        """
        self._require_env()
        correct = 0
        total = 0
        for queries, labels in self._iter_batches(dataset, drop_last=False):
            responses = self._generate(queries, do_sample=False)

            prompts_text = self.tokenizer.batch_decode(queries, skip_special_tokens=True)
            responses_text = self.tokenizer.batch_decode(responses, skip_special_tokens=True)

            for prompt, response, label in zip(prompts_text, responses_text, labels):
                sentiment_score = self.symbolic_env.analyze_sentiment(prompt + response)
                predicted_label = 1 if sentiment_score > 0.5 else 0
                if predicted_label == int(label):
                    correct += 1
                total += 1

        accuracy = correct / total if total else 0.0
        return {"accuracy": accuracy}

In [77]:
def prepare_dataset(dataset, tokenizer, max_length=512):
    """Tokenize the ``text`` column and return a torch-formatted dataset.

    Each text is truncated and padded to ``max_length`` tokens. The result
    keeps only ``input_ids`` and ``label`` (the attention mask is discarded),
    and a few debug lines describing the result are printed.

    Args:
        dataset: Dataset with ``text`` and ``label`` columns.
        tokenizer: Callable tokenizer applied to batches of texts.
        max_length: Fixed sequence length after truncation/padding.

    Returns:
        The mapped dataset, formatted to yield torch values for
        ``input_ids`` and ``label``.
    """
    kept_columns = ('label',)

    def tokenize_and_truncate(examples):
        encoded = tokenizer(
            examples['text'],
            truncation=True,
            max_length=max_length,
            padding='max_length',
            return_tensors="pt",
        )
        # Only the token ids and the label travel further.
        encoded.pop('attention_mask', None)
        encoded['label'] = examples['label']
        return encoded

    # Everything except the label column is replaced by the tokenized output.
    droppable = [name for name in dataset.column_names if name not in kept_columns]
    dataset = dataset.map(tokenize_and_truncate, batched=True, remove_columns=droppable)
    dataset.set_format(type='torch', columns=['input_ids', 'label'])

    print("Debug: Dataset preparation complete")
    print("Debug: Dataset columns:", dataset.column_names)
    print("Debug: Dataset features:", dataset.features)

    # Describe the first example: tensor shape where available, else the type.
    sample = dataset[0]
    print("Debug: Shape of first item:")
    for field, payload in sample.items():
        detail = payload.shape if isinstance(payload, torch.Tensor) else type(payload)
        print(f"  {field}: {detail}")

    return dataset

# Load the first 1000 examples of the IMDB training split.
train_dataset = load_dataset("imdb", split="train[:1000]")
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize the split; the bare name on the last line displays the result.
train_dataset = prepare_dataset(train_dataset, tokenizer)
train_dataset

Debug: Dataset preparation complete
Debug: Dataset columns: ['label', 'input_ids']
Debug: Dataset features: {'label': ClassLabel(names=['neg', 'pos'], id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None)}
Debug: Shape of first item:
  label: torch.Size([])
  input_ids: torch.Size([512])


Dataset({
    features: ['label', 'input_ids'],
    num_rows: 1000
})

In [79]:
def main(baseline_env=None):
    """Train and evaluate the RLSF policy on IMDB, then an optional baseline.

    Loads 1000 training and 1000 test reviews, tokenizes them with the GPT-2
    tokenizer, trains an ``RLSFTrainer`` for 5 epochs with sentiment-based
    feedback, and prints its evaluation result. The same procedure is then
    repeated for a baseline trainer.

    Args:
        baseline_env: Environment used to reward and score the "standard RL"
            baseline. The trainer calls methods on its environment during
            both training and evaluation, so with the default ``None`` the
            baseline is skipped (with a message) rather than crashing.
    """
    # Kept function-local, as in the original cell, because the notebook's
    # import cell does not bring `pipeline` into scope.
    from transformers import pipeline

    model_name = "gpt2"

    # Load dataset
    train_dataset = load_dataset("imdb", split="train[:1000]")
    test_dataset = load_dataset("imdb", split="test[:1000]")

    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Prepare datasets
    train_dataset = prepare_dataset(train_dataset, tokenizer)
    test_dataset = prepare_dataset(test_dataset, tokenizer)

    def make_config():
        # Single source of truth so both runs share identical hyper-parameters.
        return PPOConfig(
            model_name=model_name,
            learning_rate=1e-5,
            batch_size=4,
            mini_batch_size=2,
            gradient_accumulation_steps=1,
        )

    def run(title, env):
        # Train, evaluate and report one trainer.
        trainer = RLSFTrainer(make_config(), model_name, env)
        trainer.train(train_dataset, num_epochs=5)
        results = trainer.evaluate(test_dataset)
        print(f"{title} Results:", results)
        return results

    # Initialize symbolic environment. truncation=True keeps review-length
    # inputs within the classifier's maximum sequence length; without it,
    # over-long texts make the pipeline call fail.
    sentiment_analyzer = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english",
        device=0 if torch.cuda.is_available() else -1,
        truncation=True,
    )
    symbolic_env = SymbolicEnvironment(tokenizer, sentiment_analyzer)

    # Train and evaluate the RLSF model
    run("RLSF", symbolic_env)

    # Train and evaluate the standard RL model (without symbolic feedback)
    if baseline_env is None:
        print("Skipping standard RL baseline: no reward environment was provided for it.")
    else:
        run("Standard RL", baseline_env)

if __name__ == "__main__":
    main()