In [5]:
import sys
from datasets import load_dataset
import polars as pl
from IPython.display import Markdown
import tiktoken

sys.path.append("..")
from synthetic_data.tasks import _process_gutenberg_row
from datasets import Dataset
from synthetic_data.tasks import Output, SceneElementType

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Hugging Face Hub id of the Gutenberg DPO corpus explored in this notebook.
GUTENBERG_DPO_REPO = (
    "sam-paech/gutenberg3-generalfiction-scifi-fantasy-romance-adventure-dpo"
)

# NOTE(review): trust_remote_code=True lets `datasets` run loading code shipped
# with the dataset repo -- confirm it is actually required for this dataset.
dataset_splits = load_dataset(GUTENBERG_DPO_REPO, trust_remote_code=True)
dataset = dataset_splits["train"]

# Polars view of the train split, used for the row-wise preview further down.
dataset_pl: pl.DataFrame = dataset.to_polars()

# Tokenizer handed to _process_gutenberg_row when previewing samples.
tiktoken_encoder = tiktoken.get_encoding("o200k_base")

In [None]:
# Preview the first ten raw rows as Markdown and record each processed
# sample's token count in `lengths` (one entry per displayed sample).
lengths = []
for i, row in enumerate(dataset_pl.head(10).iter_rows(named=True)):
    source = row["source"]
    display(Markdown(f"### Sample {i} - {source}"))
    row_processed = _process_gutenberg_row(row, tiktoken_encoder)
    # Collect the count so `lengths` is not left empty after the loop.
    lengths.append(row_processed["encoded_length"])
    display(
        Markdown(
            f"#### Original ({row_processed['encoded_length']} tokens) \n {row_processed['text']}"
        )
    )

In [2]:
# Read the screenplay-scenes parquet file into a polars frame; the later cells
# read its "prompt" and "output" columns.
formatted_pq = pl.read_parquet("../screenplay_scenes_summarized_full.parquet")

In [None]:
# Display the "prompt" column for a quick visual check.
formatted_pq["prompt"]

In [None]:
# Render the parsed elements of rows 1000-1009 of `formatted_pq` as Markdown,
# one display call per element.
for row in formatted_pq[1000:1010].iter_rows(named=True):
    output_obj = Output.model_validate_json(row["output"])
    for element in output_obj.items:
        # Every branch below assigns fmt_str, so no default value is needed.
        if element.type == SceneElementType.DIALOGUE:
            fmt_str = f"**{element.character}**: {element.content}"
        elif element.type == SceneElementType.ACTION:
            fmt_str = f"*{element.content}*"
        else:
            # Any other element type: show the type name next to its content.
            fmt_str = f"{element.type.name} - {element.content}"
        display(Markdown(fmt_str))

In [3]:
# Wrap the polars frame in a `datasets.Dataset` so the filter/map pipeline
# later in the notebook can run on it.
formatted_pq_dataset = Dataset.from_polars(formatted_pq)

In [17]:
from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam


def _gutenberg_to_conversation(row: dict):
    """Build a two-turn chat record from one dataset row.

    The user turn is ``row["prompt"]``. The assistant turn is the JSON in
    ``row["output"]`` parsed with ``Output.model_validate_json`` and rendered
    one element per line: dialogue as ``**character**: content``, action as
    ``*content*``, and any other element type as its bare content.

    Returns:
        A dict with a ``"conversation"`` list (user message, then assistant
        message) plus the ``category``, ``prompt``, ``author`` and ``title``
        values copied from ``row``.
    """

    def _render(element) -> str:
        # Markdown rendering for a single scene element.
        if element.type == SceneElementType.DIALOGUE:
            return f"**{element.character}**: {element.content}"
        if element.type == SceneElementType.ACTION:
            return f"*{element.content}*"
        return element.content

    # The prompt is read before the output is parsed, so a row missing both
    # keys fails on "prompt" first.
    user_turn = {"role": "user", "content": row["prompt"]}
    parsed = Output.model_validate_json(row["output"])
    assistant_turn = {
        "role": "assistant",
        "content": "\n".join(_render(element) for element in parsed.items),
    }
    conv: list[ChatCompletionMessageParam] = [user_turn, assistant_turn]

    metadata = {key: row[key] for key in ("category", "prompt", "author", "title")}
    return {"conversation": conv, **metadata}


def _filter_gutenberg_row(row: dict):
    try:
        output_obj = Output.model_validate_json(row["output"])
    except Exception as e:
        print(e)
        return False
    if len(output_obj.items) < 5:
        return False
    n_dialogue, n_action = 0, 0
    for item in output_obj.items:
        if item.type == SceneElementType.DIALOGUE:
            n_dialogue += 1
        elif item.type == SceneElementType.ACTION:
            n_action += 1

    if n_action < 2 or n_dialogue < 2:
        return False

    return True


# Run the converter on a single row before the full pass; the return value is
# discarded, so this only surfaces exceptions.
_gutenberg_to_conversation(formatted_pq[1000].to_dicts()[0])

# Keep the rows accepted by the filter, convert each to a conversation record
# while removing the listed columns, then write the result to parquet.
rows_kept = formatted_pq_dataset.filter(_filter_gutenberg_row)
conversations_ds = rows_kept.map(
    _gutenberg_to_conversation,
    remove_columns=["chosen", "rejected", "source", "text"],
)
conversations_ds.to_pandas().to_parquet(
    "../dataset_files/gutenberg_conversations.parquet"
)

Filter: 100%|██████████| 10256/10256 [00:00<00:00, 15693.16 examples/s]
Map: 100%|██████████| 9190/9190 [00:01<00:00, 5836.30 examples/s]
