In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("/home/rame/trl/examples/llama")

In [2]:
import os
import numpy as np
import torch
from datasets import load_dataset

from trl import PPOConfig, PPOTrainer, set_seed
from trl.core import LengthSampler

import llama_utils, ppo_utils, args_utils

MIN_SIZE = 100


def build_dataset(dataset_name, *args, **kwargs):
    """Dispatch to the dataset builder that matches `dataset_name`.

    Args:
        dataset_name (`str`): "news" selects `_build_news_dataset`; every
            other value selects `_build_openai_dataset`.
        *args, **kwargs: forwarded unchanged to the selected builder.

    Returns:
        Whatever the selected builder returns.
    """
    # The conditional expression only looks up the builder that is actually used.
    builder = _build_news_dataset if dataset_name == "news" else _build_openai_dataset
    return builder(*args, **kwargs)


def _build_news_dataset(tokenizer, split="train", max_size=1500):
    """Build the query dataset from the "argilla/news-summary" comparisons.

    Rows are filtered on the character length of their text, de-duplicated on
    `id`, and each remaining row gets a tokenized prompt followed by the first
    few tokens of a reference summary.

    Args:
        tokenizer: object exposing `encode` and `decode`.
        split (`str`): "train" or "validation". These are remapped onto the
            loaded splits "test" and "train" respectively; any other value
            raises `KeyError`.
        max_size (`int`): exclusive upper bound on the text length in
            characters. The exclusive lower bound is the module-level `MIN_SIZE`.

    Returns:
        The processed dataset, set to torch format, with the extra columns
        `input_ids` and `query`.
    """
    # NOTE(review): the requested split is deliberately swapped before loading,
    # presumably because of the relative sizes of the hub splits - confirm.
    hub_split = {"train": "test", "validation": "train"}[split]
    ds = load_dataset("argilla/news-summary", name="comparisons", split=hub_split, use_auth_token=True)
    ds_filtered = ds.filter(
        lambda x: x["text"] is not None and MIN_SIZE < len(x["text"]) < max_size and x["id"] is not None,
        batched=False,
    )

    def remove_duplicate(duplicated_dataset):
        # Read the id column directly; np.unique returns the index of the first
        # occurrence of each distinct id (rows end up ordered by sorted id).
        _, unique_indices = np.unique(duplicated_dataset["id"], return_index=True, axis=0)
        return duplicated_dataset.select(unique_indices.tolist())

    ds_deduplicated = remove_duplicate(ds_filtered)
    # Draws how many tokens beyond the prompt are kept in each query.
    input_size_sampler = LengthSampler(2, 8)

    def tokenize(sample):
        # Keep only what follows the "(Reuters) -" marker; a text without the
        # marker produces an empty post.
        flat_text = sample["text"].replace("\n", " ")
        info_post = "-".join(flat_text.split("(Reuters) -")[1:]).strip()
        prompt_summary = llama_utils.Instructions.get_prompt_summary(post=info_post)
        # NOTE(review): the -1 presumably discounts one special token added by
        # `encode` - confirm against the tokenizer in use.
        size_prompt_summary = len(tokenizer.encode(prompt_summary)) - 1
        input_size = size_prompt_summary + input_size_sampler()
        choice = 0  # select the best summary
        response = sample["prediction"][choice]["text"].replace("\n", " ").replace(".", ",")
        # Truncate so the query ends a few tokens into the reference summary.
        sample["input_ids"] = tokenizer.encode(prompt_summary + response)[:input_size]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds_mapped = ds_deduplicated.map(tokenize, batched=False, load_from_cache_file=False)
    ds_mapped.set_format(type="torch")
    return ds_mapped

  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: Required library version not found: libsbitsandbytes_cpu.so. Maybe you need to compile it from source?
CUDA SETUP: Defaulting to libbitsandbytes_cpu.so...
libcurand.so.10: cannot open shared object file: No such file or directory


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [3]:
tokenizer = llama_utils.Tokenizer.load_tokenizer("decapoda-research/llama-7b-hf")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


In [4]:
dataset = build_dataset(tokenizer=tokenizer, dataset_name="news")

Found cached dataset parquet (/home/rame/.cache/huggingface/datasets/argilla___parquet/argilla--news-summary-46ccad7a40bceec1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /home/rame/.cache/huggingface/datasets/argilla___parquet/argilla--news-summary-46ccad7a40bceec1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-4641da6c18afa7bf.arrow
Loading cached processed dataset at /home/rame/.cache/huggingface/datasets/argilla___parquet/argilla--news-summary-46ccad7a40bceec1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8a4223dbadb9a091.arrow
                                                                                                                                                                                                                                                            

In [6]:
print(dataset)

Dataset({
    features: ['text', 'prediction', 'prediction_agent', 'annotation', 'annotation_agent', 'id', 'metadata', 'status', 'event_timestamp', 'metrics', 'input_ids', 'query'],
    num_rows: 6971
})


In [None]:

# Switch `dataset` to pandas output. The call is not re-assigned, i.e. it changes
# `dataset` itself, so every later cell that indexes `dataset` is affected too.
dataset.set_format("pandas")
# df_batch = ds[:].sample(bs)
df_batch = dataset[:10]  # first 10 rows, in the format selected above

In [None]:
df_batch["query"][0]

In [None]:
# `dataset` is a single-split Dataset (see the printed summary above), so it is
# sliced directly; indexing it with "train" would look up a column of that name.
dataset[:10]

In [None]:
# Slicing returns the rows column-wise, so iterating over the slice yields column
# names rather than rows; select the "query" column of the slice instead.
list(dataset[:10]["query"])

In [None]:
ds = load_dataset("openai/summarize_from_feedback", name="comparisons", split="train")

In [None]:
dsf = ds.filter(lambda x: len(x["info"]["post"]) < 1200, batched=False)

In [None]:
dsf

In [None]:
"\na \n aljkka \n".strip()

In [None]:
ds["summaries"][0]

In [None]:
load_dataset??

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# NOTE: this rebinds the notebook-level `tokenizer` and `model` names.
tokenizer = AutoTokenizer.from_pretrained("CogComp/bart-faithful-summary-detector")

article = "Ban Ki-Moon was re-elected for a second term by the UN General Assembly, unopposed and unanimously, on 21 June 2011."

# Two summaries that differ only in the year: 2007 contradicts the article, 2011 matches it.
bad_summary = "Ban Ki-moon was elected for a second term in 2007."
good_summary = "Ban Ki-moon was elected for a second term in 2011."

# Each summary is encoded as a (summary, article) text pair.
bad_pair = tokenizer(text=bad_summary, text_pair=article, return_tensors='pt')
good_pair = tokenizer(text=good_summary, text_pair=article, return_tensors='pt')

model = AutoModelForSequenceClassification.from_pretrained("CogComp/bart-faithful-summary-detector")

bad_score = model(**bad_pair)
good_score = model(**good_pair)

# A cell only displays its last expression, so both scores are returned together
# as (good, bad); previously the good score was computed and then dropped.
good_score[0][:, 1], bad_score[0][:, 1]



In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.append("/home/rame/trl/examples/llama")

In [None]:
import llama_utils

In [None]:
import torch

In [None]:
device = 0 if torch.cuda.is_available() else "cpu"

In [None]:
sentiment_pipe = llama_utils.Pipelines.load_pipe(sentiment_model="Tristan/gpt2_reward_summarization", device=device)

In [None]:
sentiment_pipe_v2 = llama_utils.Pipelines.load_pipe(sentiment_model="CogComp/bart-faithful-summary-detector", device=device)

In [None]:
# Candidate summaries to score against `article` below: an opinion about
# Ban Ki-moon, a statement matching the article, and a sentence about an
# unrelated person.
texts = [
    "Ban Ki-moon was a very good president.",
    "Ban Ki-moon was elected for a second term in 2011.",
    "Zinedine Yazid Zidane, popularly known as Zizou, is a French professional football manager and former player who played as an attacking midfielder."
    
]
# Source text that every candidate in `texts` is paired with.
article = "Ban Ki-Moon was re-elected for a second term by the UN General Assembly, unopposed and unanimously, on 21 June 2011."

In [None]:
# Build the inputs for `sentiment_pipe`: each candidate summary is combined with
# `article` by llama_utils.transform_text, which is handed the same pipeline.
texts_v1 = [
    llama_utils.transform_text(
        sentiment_pipe=sentiment_pipe,
        response_text=text,
        instruction=article
    ) for text in texts
]

In [None]:
# Build the inputs for `sentiment_pipe_v2`. The texts are formatted with that
# same pipeline (not with `sentiment_pipe`), since they are scored by it below.
texts_v2 = [
    llama_utils.transform_text(
        sentiment_pipe=sentiment_pipe_v2,
        response_text=text,
        instruction=article
    ) for text in texts
]

In [None]:
# Keyword arguments passed to both scoring pipelines in the cells below.
# NOTE(review): "function_to_apply": "none" presumably yields raw, un-normalised
# scores - confirm against the transformers pipeline documentation.
sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 1}

In [None]:
sentiment_pipe(texts_v1, **sent_kwargs)

In [None]:
sentiment_pipe_v2(texts_v2, **sent_kwargs)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# NOTE: this rebinds the notebook-level `tokenizer` and `model` names.
tokenizer = AutoTokenizer.from_pretrained("Tristan/gpt2_reward_summarization")
model = AutoModelForSequenceClassification.from_pretrained("Tristan/gpt2_reward_summarization")

bad_summary = "Ban Ki-moon was a very good president."
# This assignment had no right-hand side (a SyntaxError). The value restored here
# is the article-consistent summary used by the other cells of this notebook.
# NOTE(review): confirm this is the summary that was intended.
good_summary = "Ban Ki-moon was elected for a second term in 2011."

# `article` is defined in an earlier cell. Each input is laid out as
# "summary:<summary><eos>article:<article>" and scored as a batch of one.
bad_input_ids = tokenizer.encode("summary:" + bad_summary + tokenizer.eos_token + "article:" + article)
bad_score = model(input_ids=torch.tensor([bad_input_ids]))[0]
print("bad", bad_score[0].detach())
good_input_ids = tokenizer.encode("summary:" + good_summary + tokenizer.eos_token + "article:" + article)
good_score = model(input_ids=torch.tensor([good_input_ids]))[0]
print("good", good_score[0].detach())

In [None]:
# This assignment had no right-hand side (a SyntaxError).
# NOTE(review): the value is restored from the summary with the wrong year used in
# the faithful-summary cell earlier in this notebook - confirm it is the intended one.
bad_summary = "Ban Ki-moon was elected for a second term in 2007."

In [None]:

response_text = good_summary + " " + tokenizer.bos_token + " " + article

In [None]:
# Score the bad summary: summary and article are joined around the tokenizer's
# BOS token, encoded, and passed to the model as a batch of one.
bad_input_ids = tokenizer.encode(bad_summary + " " + tokenizer.bos_token + " " + article)
# Element [0] of the model output is kept - presumably the logits; confirm.
bad_score = model(input_ids=torch.tensor([bad_input_ids]))[0]
print("bad", bad_score[0].detach())

In [None]:
# Score the good summary: `response_text` (built in an earlier cell) is encoded
# and passed to the model as a batch of one.
good_input_ids = tokenizer.encode(response_text)
# Element [0] of the model output is kept - presumably the logits; confirm.
good_score = model(input_ids=torch.tensor([good_input_ids]))[0]
print("good", good_score[0].detach())

In [None]:
def turn_into_text_classification_format(examples):
    """Turn a batch of summary comparisons into paired classification texts.

    Args:
        examples: column-wise batch with the keys "info", "summaries" and
            "choice". Each "info" entry holds the source text under "post",
            or under "article" when "post" is None.

    Returns:
        A dict with two parallel lists: "text_j" holds the chosen summary and
        "text_k" the other one, each followed by the module-level tokenizer's
        BOS token and the source text, separated by single spaces.

    Raises:
        ValueError: if an example does not have exactly two summaries or its
            choice is not 0 or 1.
    """
    chosen_texts = []
    other_texts = []
    rows = zip(examples["info"], examples["summaries"], examples["choice"])
    for info, summaries, choice in rows:
        is_valid = len(summaries) == 2 and choice in (0, 1)
        if not is_valid:
            raise ValueError(
                f"There should be two summaries with a choice that's either 0 or 1. Received {len(summaries)} summaries and choice={choice}."
            )
        # Use "article" only when no post is available.
        source_key = "article" if info["post"] is None else "post"
        other = 1 - choice  # index of the summary that was not chosen
        chosen_texts.append(summaries[choice]["text"] + " " + tokenizer.bos_token + " " + info[source_key])
        other_texts.append(summaries[other]["text"] + " " + tokenizer.bos_token + " " + info[source_key])

    return {"text_j": chosen_texts, "text_k": other_texts}