In [None]:
from datasets import load_dataset
import polars as pl
from huggingface_hub import snapshot_download
import os
from typing import Sequence
from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
from datasets import Dataset
from pathlib import Path
from pprint import pprint
from IPython.display import Markdown, display

In [None]:
# Dataset repositories to fetch. Commented-out entries are skipped.
dataset_names = (
    # "lara-martin/FIREBALL",
    # "MinervaAI/Aesir-Preview",
    # "hieunguyenminh/roleplay",
    # "chargoddard/rpguild",
    # "jondurbin/cinematika-v0.1",
    # "codeparrot/apps",
    # "glaiveai/glaive-code-assistant-v3",
    # "jondurbin/py-dpo-v0.1",
    # "lemonilia/roleplaying-forums-raw",
    # "deepmind/code_contests",
    # "SenseLLM/ReflectionSeq-DS",
    # "openai/openai_humaneval",
    # "argilla/distilabel-intel-orca-dpo-pairs"
    # not downloaded
    # Squish42/bluemoon-fandom-1-1-rp-cleaned
    "SaylorTwift/Gutenberg",
)

for dataset_name in dataset_names:
    # The call returns the local directory of the dataset; the return value
    # is not kept here.
    snapshot_download(repo_id=dataset_name, repo_type="dataset")

In [None]:
# Resolve the local directory of the raw roleplaying-forums dataset and echo
# it as the cell output.
files = snapshot_download(
    repo_id="lemonilia/roleplaying-forums-raw",
    repo_type="dataset",
)
files

In [None]:
# Local directory of the cinematika dataset; later cells build file paths
# from it. The directory listing is shown as the cell output.
cinematika_root_dir = snapshot_download(
    repo_id="jondurbin/cinematika-v0.1",
    repo_type="dataset",
)
os.listdir(cinematika_root_dir)

In [None]:
# Print every entry in the cinematika directory; for parquet files also load
# them and print the number of rows.
for filename in os.listdir(cinematika_root_dir):
    file_path = os.path.join(cinematika_root_dir, filename)
    extension = os.path.splitext(file_path)[1]
    print(filename)
    if extension != ".parquet":
        continue
    dataset = Dataset.from_parquet(file_path)
    print(len(dataset))

In [78]:
# Load three cinematika parquet files, in this order, into their own datasets.
# NOTE(review): `plain_scenes_dataset` is read from "full_script.parquet" —
# confirm that this is the intended file for that name.
scene_by_scene_dataset, actions_dataset, plain_scenes_dataset = (
    Dataset.from_parquet(os.path.join(cinematika_root_dir, parquet_name))
    for parquet_name in (
        "scene_by_scene.parquet",
        "actions.parquet",
        "full_script.parquet",
    )
)

In [None]:
# Resolve the local directory of the rpguild dataset, then show where it is,
# what it contains, and its README.
rp_root_dir = snapshot_download("chargoddard/rpguild", repo_type="dataset")
print(rp_root_dir)
# A bare expression in the middle of a cell is not displayed, so the listing
# has to be printed explicitly.
print(os.listdir(rp_root_dir))
# read_text() opens and closes the file itself (no leaked handle); the
# encoding is pinned so the result does not depend on the platform default.
print(Path(rp_root_dir, "README.md").read_text(encoding="utf-8"))

In [28]:
# Path of the single parquet shard inside the "grammar_filtered" directory of
# the rpguild dataset, read into a polars DataFrame.
_grammar_filtered_parts = ("grammar_filtered", "train-00000-of-00001.parquet")
rp_grammar_filtered_path = os.path.join(rp_root_dir, *_grammar_filtered_parts)
rp_grammar_filtered_dataset = pl.read_parquet(source=rp_grammar_filtered_path)

In [None]:
# Bare last expression: the notebook renders the DataFrame as this cell's output.
rp_grammar_filtered_dataset

In [None]:
from typing import Dict, List


def dictl(dict_of_lists: Dict[str, List]) -> Sequence[dict]:
    """
    Transpose a dict of lists into a list of dicts.

    Row ``i`` of the result maps every key to the ``i``-th element of that
    key's list. Rows are produced with ``zip``, so the result is as long as
    the shortest list; an empty mapping gives an empty list.
    """
    keys = list(dict_of_lists.keys())
    rows = []
    for values in zip(*dict_of_lists.values()):
        rows.append(dict(zip(keys, values)))
    return rows


# Preview the grammar-filtered rows: print the username/reply pair and the row
# keys, then render each context message as Markdown. Pairing the rows with
# range(12) stops the preview after index 11, i.e. at most 12 rows are shown.
for i, sample in zip(range(12), rp_grammar_filtered_dataset.iter_rows(named=True)):
    print(sample["username"], sample["reply"])
    print(sample.keys())
    display(Markdown(f"### Sample {i}"))
    for msg in sample["context"]:
        rendered = f"**{msg['char_name']}**\n\n {msg['text']}"
        display(Markdown(rendered))

In [None]:
# Load the "pippa_deduped" configuration of PIPPA and keep only its "train"
# split, converted to a polars DataFrame.
# trust_remote_code=True allows the dataset repository's own loading code to run.
pippa_splits = load_dataset(
    "PygmalionAI/PIPPA",
    "pippa_deduped",
    trust_remote_code=True,
)
pippa_dataset = pippa_splits["train"].to_polars()

In [None]:
from collections import defaultdict
from dataclasses import dataclass
import re
from typing import Literal, Optional, List
from tqdm import tqdm


@dataclass
class DialogueLine:
    """A single line of dialogue attributed to a character.

    NOTE(review): nothing in the visible notebook constructs or reads this
    class, so the field meanings below are inferred from their names — confirm.
    """

    # Presumably the name of the speaker.
    character: str
    # Presumably the text of the line.
    content: str
    # Presumably True when the line was written by a human rather than a bot.
    is_human: bool


def format_conversation_oai(sample: dict) -> "list[ChatCompletionMessageParam]":
    """Turn one row dict into a list of OpenAI-style chat messages.

    Args:
        sample: Mapping with a ``"bot_description"`` entry and a
            ``"conversation"`` entry whose ``"message"`` and ``"is_human"``
            values are parallel sequences (they are zipped together).

    Returns:
        A new list that starts with the bot description as an ``assistant``
        message, followed by one message per conversation turn: ``user`` when
        ``is_human`` is truthy, ``assistant`` otherwise.

    Raises:
        KeyError: If any of the keys named above is missing.
    """
    # TODO handle interpolation and character intros
    conversation = sample["conversation"]
    description = sample["bot_description"]
    # Annotated as a list rather than Sequence: the value is built by mutation
    # and Sequence has no append/extend.
    out_conv: list[ChatCompletionMessageParam] = [
        {"role": "assistant", "content": description}
    ]
    out_conv.extend(
        {"role": "user" if is_human else "assistant", "content": msg}
        for msg, is_human in zip(conversation["message"], conversation["is_human"])
    )
    # Echo the result so that calling the function for inspection shows it.
    print(out_conv)
    return out_conv


# Try the formatter on the first row only. The rows come from `pippa_dataset`,
# whose rows carry the "conversation" and "bot_description" entries the
# formatter reads; no other frame with those columns is defined in this
# notebook.
for sample in pippa_dataset.iter_rows(named=True):
    print(sample.keys())
    format_conversation_oai(sample)
    break

# Convert every PIPPA row into a chat transcript: one system message that
# names and describes the bot, then one message per conversation turn.
all_convs_out = []
# iter_rows() yields rows without exposing a length, so the row count is
# passed as `total` to let tqdm show a percentage and an ETA.
for row in tqdm(pippa_dataset.iter_rows(named=True), total=pippa_dataset.height):
    conv_in, bot_description, bot_name = (
        row["conversation"],
        row["bot_description"],
        row["bot_name"],
    )
    conv_out: list[ChatCompletionMessageParam] = [
        {
            "role": "system",
            "content": f"You are talking to {bot_name}. {bot_description}",
        }
    ]
    for msg, is_human in zip(conv_in["message"], conv_in["is_human"]):
        if is_human:
            conv_out.append({"role": "user", "content": msg})
        else:
            # Bot turns are prefixed with the bot's name.
            conv_out.append({"role": "assistant", "content": f"{bot_name}: {msg}"})
    all_convs_out.append(conv_out)

In [4]:
# Persist the converted conversations as a single-column parquet file in the
# current working directory.
pippa_conversations_frame = pl.DataFrame({"conversation": all_convs_out})
pippa_conversations_frame.write_parquet("pippa_conversations.parquet")