In [1]:
from itertools import chain

import numpy as np
import tiktoken
from datasets import Dataset, concatenate_datasets, load_dataset
from pandas import Series

In [2]:
def build_dialogue(group):
    """Flatten one group of dialogue rows into a single chat transcript.

    Args:
        group: Mapping-like object (e.g. a grouped DataFrame) whose "user" and
            "assistant" entries are iterated in parallel, one pair per turn.

    Returns:
        A pandas Series with a single "messages" entry: a list of
        ``{"role", "content"}`` dicts alternating user -> assistant in row order.
    """
    transcript = []
    for user_text, assistant_text in zip(group["user"], group["assistant"]):
        transcript.append({"role": "user", "content": user_text})
        transcript.append({"role": "assistant", "content": assistant_text})
    return Series({"messages": transcript})


def build_messages(example):
    """Turn one QA example into a two-turn chat transcript.

    Args:
        example: Mapping with a "question" and a "personalized_answer" entry.

    Returns:
        ``{"messages": [user_turn, assistant_turn]}`` where the user turn carries
        the question and the assistant turn carries the personalized answer.
    """
    user_turn = {"role": "user", "content": example["question"]}
    assistant_turn = {"role": "assistant", "content": example["personalized_answer"]}
    return {"messages": [user_turn, assistant_turn]}


def count_tokens(example, tokenizer, feature, tokens_per_message=3, tokens_per_name=1, tokens_per_reply=3):
    """Count the tokens of a chat transcript, including per-message overhead.

    Every value of every message dict is encoded with ``tokenizer.encode`` and
    its length is added to the total, together with fixed overheads that used
    to be hard-coded and are now tunable.

    Args:
        example: Mapping whose ``feature`` entry is an iterable of message dicts
            (e.g. ``{"role": ..., "content": ...}``).
        tokenizer: Object exposing ``encode(text)`` that returns a sized sequence.
        feature: Key of ``example`` that holds the messages.
        tokens_per_message: Fixed overhead added once per message.
        tokens_per_name: Extra overhead added when a message has a "name" key.
        tokens_per_reply: Fixed overhead added once per transcript.

    Returns:
        ``{f"{feature}_tokens": total}``. The defaults reproduce the previously
        hard-coded overheads (3 per message, 1 per name, 3 for the reply).
    """
    total = 0
    for message in example[feature]:
        total += tokens_per_message
        for key, value in message.items():
            total += len(tokenizer.encode(value))
            if key == "name":
                total += tokens_per_name
    total += tokens_per_reply  # every reply is primed with <|start|>assistant<|message|>
    return {f"{feature}_tokens": total}

In [3]:
# Load every split of the HiCUPID "dialogue" config and merge them into one dataset.
dialogue_dict = load_dataset("arranonymsub/HiCUPID", name="dialogue")
dialogue = concatenate_datasets(list(dialogue_dict.values()))

# Convert to pandas once and reuse the frame for both groupings instead of
# materialising the same DataFrame twice.
dialogue_df = dialogue.to_pandas()
# One transcript per (user_id, type) pair and one per user_id.
by_type = Dataset.from_pandas(dialogue_df.groupby(["user_id", "type"]).apply(build_dialogue, include_groups=False))
by_user = Dataset.from_pandas(dialogue_df.groupby("user_id").apply(build_dialogue, include_groups=False))

# Load every split of the "qa" config, merge them, and attach a "messages" column.
qa_dict = load_dataset("arranonymsub/HiCUPID", name="qa")
qa = concatenate_datasets(list(qa_dict.values()))
qa = qa.map(build_messages, num_proc=16)

In [4]:
# Tokenizer and shared keyword arguments for every token-counting pass below.
tokenizer = tiktoken.encoding_for_model("gpt-2")
count_kwargs = {"tokenizer": tokenizer, "feature": "messages"}


def _only_type(dialogue_type):
    """Return the rows of `by_type` whose "type" column equals `dialogue_type`."""
    return by_type.filter(lambda row: row["type"] == dialogue_type, num_proc=16)


# Add a "messages_tokens" column to each dataset; the per-type subsets are
# taken after counting so they inherit the column.
by_type = by_type.map(count_tokens, fn_kwargs=count_kwargs, num_proc=16)
persona = _only_type("persona")
profile = _only_type("profile")
schedule = _only_type("schedule")
by_user = by_user.map(count_tokens, fn_kwargs=count_kwargs, num_proc=16)
qa = qa.map(count_tokens, fn_kwargs=count_kwargs, num_proc=16)

# Report mean +- sample standard deviation (ddof=1) of the token counts.
length_reports = [
    ("Dialogue Length (Persona)", persona),
    ("Dialogue Length (Profile)", profile),
    ("Dialogue Length (Schedule)", schedule),
    ("Dialogue Length (Whole)", by_user),
    ("QA Length", qa),
]
for label, subset in length_reports:
    token_counts = subset["messages_tokens"]
    print(f"{label}: {np.mean(token_counts):.1f} +- {np.std(token_counts, ddof=1):.1f}")

Map (num_proc=16):   0%|          | 0/4500 [00:00<?, ? examples/s]

Filter (num_proc=16):   0%|          | 0/4500 [00:00<?, ? examples/s]

Filter (num_proc=16):   0%|          | 0/4500 [00:00<?, ? examples/s]

Filter (num_proc=16):   0%|          | 0/4500 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/1500 [00:00<?, ? examples/s]

Dialogue Length (Persona): 15962.3 +- 538.1
Dialogue Length (Profile): 329.0 +- 31.4
Dialogue Length (Schedule): 970.9 +- 50.7
Dialogue Length (Whole): 17256.3 +- 543.7
QA Length: 57.3 +- 17.9
