In [None]:
import pydantic
import sys
from pydantic import BaseModel
from datasets import load_dataset
import asyncio
import polars as pl
from IPython.display import Markdown


# Add the parent directory to the import search path before the project imports
# below run (presumably where `synthetic_data` and `trl_wrapper` live - confirm).
sys.path.append("..")
from synthetic_data.generation import RemoteModel, get_generation_wrapper

# Turn on autoreload (mode 2) before importing from trl_wrapper so that edits to
# imported modules are picked up on the next cell execution.
%load_ext autoreload
%autoreload 2
from trl_wrapper.trainer_wrapper import TrainerWrapper, PLAYWRIGHT_CONFIG, SMOL_LM_135M

# Load the dataset from the Hugging Face Hub and keep only its "train" split.
# NOTE(review): trust_remote_code=True permits running loading code shipped with
# the dataset repository; confirm it is still required for this dataset.
dataset = load_dataset("sam-paech/gutenberg3-generalfiction-scifi-fantasy-romance-adventure-dpo", trust_remote_code=True)[
    "train"
]
# Polars view of the same split, used by the inspection cells that follow.
dataset_pl: pl.DataFrame = dataset.to_polars()


In [None]:
# Show the column names of the Polars frame as this cell's output.
dataset_pl.columns

In [12]:
# Render one Markdown heading per row for the first ten rows, showing the row
# index and the value of its `source` column.
for i, row in enumerate(dataset_pl.head(10).iter_rows(named=True)):
    source = row['source']
    display(Markdown(f"### Sample {i} - {source}"))
    # Disabled: would additionally render the content of the last entry in `chosen`.
    # display(Markdown(row['chosen'][-1]['content']))

### Sample 0 - [GENERAL FICTION] Virginia_Woolf -- Night_and_Day

### Sample 1 - [ADVENTURE] Rafael Sabatini -- Scaramouche: A Romance of the French Revolution

### Sample 2 - [ADVENTURE] Talbot Mundy -- King--of the Khyber Rifles: A Romance of Adventure

### Sample 3 - [FANTASY] Howard Pyle -- Otto of the Silver Hand

### Sample 4 - [ADVENTURE] Rudyard Kipling -- "Captains Courageous": A Story of the Grand Banks

### Sample 5 - [FANTASY] Ernest Bramah -- The Wallet of Kai Lung

### Sample 6 - [ROMANCE] E_Werner -- The_Northern_Light_Translated_by_Mrs_D_M_Lowrey

### Sample 7 - [GENERAL FICTION] Herman_Melville -- Bartleby_The_Scrivener

### Sample 8 - [GENERAL FICTION] Elizabeth_Gaskell -- A_Dark_Night's_Work

### Sample 9 - [SCIFI] Mary E. Bradley Lane -- Mizora: A Prophecy

In [None]:
import json
from typing import Sequence
from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
from synthetic_data.screenplay_parser import Scene
from IPython.display import display, Markdown

# The return annotation is a forward-reference string: it is only needed by type
# checkers, so it is not evaluated when the function is defined.
def format_conversation_oai(sample: dict) -> "Sequence[ChatCompletionMessageParam]":
    """Build the chat messages that ask a model to extract scene elements.

    Args:
        sample: Mapping with a ``"bot_name"`` entry and a ``"conversation"``
            entry. The conversation holds two parallel sequences:
            ``"message"`` (the text of each turn) and ``"is_human"`` (truthy
            when the turn was written by the human side).

    Returns:
        A two-element list: a fixed system instruction, followed by a user
        message whose content is the JSON-encoded list of
        ``{"role": ..., "content": ...}`` turns. Human turns are labelled
        ``"User"``; every other turn is labelled with ``bot_name``.

    Raises:
        KeyError: If ``sample`` (or its conversation) lacks a required key.
    """
    # TODO handle interpolation and character intros
    conversation, bot_name = sample["conversation"], sample["bot_name"]
    # zip() pairs each message with its flag and stops at the shorter sequence.
    turns: list[dict] = [
        {"role": "User" if is_human else bot_name, "content": msg}
        for msg, is_human in zip(conversation["message"], conversation["is_human"])
    ]

    return [
        {
            "role": "system",
            "content": "Extract the dialogue, actions, and descriptions from the conversation given by the user.",
        },
        # The whole transcript travels as one JSON string inside the user message.
        {"role": "user", "content": json.dumps(turns)},
    ]

def print_oai_conversation(sample: dict) -> None:
    """Render one conversation sample as Markdown in the notebook output.

    Args:
        sample: Mapping with a ``"bot_name"`` entry and a ``"conversation"``
            entry. The conversation holds two parallel sequences:
            ``"message"`` (the text of each turn) and ``"is_human"`` (truthy
            when the turn was written by the human side).

    Returns:
        None. The function only displays output: a heading with the bot name,
        then one ``**role**: content`` line per turn, where the role is
        ``"User"`` for human turns and ``bot_name`` otherwise.

    Raises:
        KeyError: If ``sample`` (or its conversation) lacks a required key.
    """
    conversation, bot_name = sample["conversation"], sample["bot_name"]
    display(Markdown(f"#### Conversation: {bot_name}"))
    # zip() pairs each message with its flag and stops at the shorter sequence.
    for msg, is_human in zip(conversation["message"], conversation["is_human"]):
        role = "User" if is_human else bot_name
        display(Markdown(f"**{role}**: {msg}"))

# Display ten rows (positions 500-509) as formatted conversations.
# NOTE(review): print_oai_conversation reads "conversation" and "bot_name" from
# each row, while the preview cell reads "source"/"chosen" from this frame -
# confirm these rows actually carry the conversation keys.
for row in dataset_pl[500:510].iter_rows(named=True):
    print_oai_conversation(row)

In [None]:
from synthetic_data.generation import GenWrapperArgs
from typing import List
from enum import Enum


# Closed set of element kinds a scene item can have; each member's value is the
# lowercase string form of its name. Used as the type of `SceneElement.type`.
class SceneElementType(Enum):
    SCENE_HEADING = "scene_heading"
    ACTION = "action"
    DIALOGUE = "dialogue"
    TRANSITION = "transition"


# One extracted scene item: its kind, its text, and an optional character name.
class SceneElement(BaseModel):
    type: SceneElementType  # which kind of element this item is
    content: str  # text of the element
    # Optional; defaults to None. Presumably the speaker for dialogue items - TODO confirm.
    character: str | None = None


# Top-level structured result: passed as `response_format` when building the
# generation wrapper and used to validate the parsed completion.
class Output(BaseModel):
    items: List[SceneElement]  # the extracted elements, as a list


# Model identifier taken from the RemoteModel enum member's value.
model: str = RemoteModel.GPT_4O_MINI.value
# Build the generation wrapper, requesting responses shaped like `Output`.
# NOTE(review): the model id is repeated here as a string literal; confirm it is
# meant to stay in sync with `model` above.
generation_wrapper = get_generation_wrapper(
    model, GenWrapperArgs(model_id="gpt-4o-mini", response_format=Output)
)  # type: ignore

# Format a single dataset row (index 5) into chat messages.
# NOTE(review): format_conversation_oai reads "conversation" and "bot_name";
# confirm this dataset's rows provide those keys.
sample = format_conversation_oai(dataset[5])

# Top-level await: relies on the notebook kernel running cells inside an event loop.
completions = await generation_wrapper.generate([sample])
print(completions)

In [None]:
# Decode the first completion as JSON (assumes it is a JSON string - TODO confirm
# against the generation wrapper's return type), then validate it against `Output`.
out_json = json.loads(completions[0])
out = Output.model_validate(out_json)
# Show the validated scene elements as this cell's output.
out.items