## Dataset preparation

This section sets up everything needed to build the synthetic dataset: imports, configuration, prompt templates, and helper functions.

In [1]:
from __future__ import annotations

import asyncio
import random
import re
import textwrap
from pathlib import Path
from typing import Any, Dict, List

import orjson
from tqdm import tqdm
import lmstudio as lms

from datasets import load_dataset
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


A character card definition on its own does not provide enough quality training data for a character. We therefore use the information in the card to create a synthetic dataset, prompting an LLM to roleplay as the character and reveal expositional information through dialogue.

In [2]:
# Path to the character card (.json) that the synthetic dataset is generated from.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
CHAR_JSON_PATH = "/Users/aimeri/Downloads/Cricket.json"
# Upper bound on tokens per example; passed as `max_length` (with truncation) when tokenising.
MAX_LENGTH = 2048
# Hugging Face model id whose tokenizer and chat template are used to encode the dataset.
MODEL_NAME = "HuggingFaceTB/SmolLM2-135M-Instruct"
DEVICE = "mps" # "mps" for Apple Silicon; use "cuda" for NVIDIA GPUs or "cpu" for CPU-only

# --------------------------------------------------------------------------------------
# Template library for generating synthetic data
# Each entry is (mode, prompt-template) where fields in single curly braces {field} 
# will be filled with card attributes (name, description, etc.) or template variables
# (question, topic, user_prompt, etc.) using str.format().
# --------------------------------------------------------------------------------------
_TEMPLATES: List[tuple[str, str]] = [
    (
        "short_qa",
        textwrap.dedent(
            """
            You are {name}. Answer the question in first person and stay in character.
            Q: {question}
            A:""",
        ).strip(),
    ),
    (
        "narration",
        "Write one paragraph describing {name} entering a room from {name}'s perspective. Mention at least one physical trait in a subtle way.",
    ),
    (
        "monologue",
        "In two sentences let {name} reflect on {topic} while subtly referencing {fact}.",
    ),
    (
        "dialogue_turn",
        "User: {user_prompt}\n### {name}:",
    ),
    (
        "character_response",
        "{user_prompt}",
    ),
    (
        "internal_thought",
        "Write {name}'s internal thoughts about {situation} in first person.",
    ),
]

# Pools of values for the template variables referenced above. `_fill_template`
# picks one entry at random from each pool every time it fills a template.

# Candidates for the `{question}` placeholder.
_DEFAULT_QUESTIONS = [
    "What drives you?",
    "Describe your greatest fear.",
    "Why do you keep adventuring despite the risks?",
    "Do you believe people can change their fate?",
]
# Candidates for the `{topic}` placeholder.
_DEFAULT_TOPICS = [
    "the nature of courage",
    "loneliness on the road",
    "the weight of leadership",
    "how the stars guide travellers",
]
# Candidates for the `{user_prompt}` placeholder.
_DEFAULT_USER_PROMPTS = [
    "Tell me about your homeland.",
    "How did you acquire your skills?",
    "What's your next goal?",
    "Do you trust the new companion?",
    "What's your biggest regret?",
    "How do you handle failure?",
    "Do you ever feel lust?",
    "What's your favorite food?",
    "What's your favorite drink?",
    "What's your favorite color?",
    "What's your favorite animal?",
    "What's your favorite book?",
    "What's your favorite activity?",
    "Have you ever been in love?",
    "Have you lost someone dear to you?",
]

_DEFAULT_SITUATIONS = [
    "facing an impossible challenge",
    "meeting an old enemy",
    "discovering a hidden truth",
    "making a difficult choice",
    "losing something important",
    "facing certain death",
    "experienceing pleasure",
]

In [35]:
# --------------------------------------------------------------------------------------
# LM Studio helper – sends one prompt through the LM Studio Python SDK and returns the
# prediction result object.
# --------------------------------------------------------------------------------------
async def _lmstudio_chat(
    prompt: str,
    max_tokens: int = 160,
    temperature: float = 0.8,
    top_p: float = 0.9,
) -> "lms.PredictionResult":
    """Ask the LM Studio model to respond to *prompt* and return the prediction result.

    Args:
        prompt: Full prompt text passed to ``model.respond``.
        max_tokens: Value sent as ``maxTokens`` in the prediction config.
        temperature: Value sent as ``temperature`` in the prediction config.
        top_p: Value sent as ``topPSampling`` in the prediction config.

    Returns:
        The object returned by ``model.respond``; callers read the generated text
        from its ``.content`` attribute.

    Note:
        ``model.respond`` is called without ``await``, so the event loop is blocked
        while the model generates. The function is ``async`` only so that callers
        can ``await`` it.
    """
    # Handle obtained with no model identifier.
    # NOTE(review): presumably resolves to the model currently loaded in LM Studio — confirm.
    model = lms.llm()

    return model.respond(
        prompt,
        config={
            "temperature": temperature,
            "topPSampling": top_p,
            "maxTokens": max_tokens,
        },
    )

In [36]:
# --------------------------------------------------------------------------------------
# Card helpers
# --------------------------------------------------------------------------------------

# Whitelist of card keys that `_load_card` keeps; every other key in the card JSON is dropped.
# NOTE(review): `_make_card_block` also looks for "species", "age", "gender" and
# "first_person", none of which are listed here — confirm whether they should be added.
_KEEP_FIELDS = {
    "name",
    "description",
    "personality",
    "mes_example",
    "scenario",
}


def _load_card(path: Path) -> Dict[str, str]:
    """Load a SillyTavern card (.json) and keep only whitelisted string fields.

    Both card layouts are supported: legacy cards that store the character fields
    at the top level, and Character Card V2 files that nest them under ``"data"``.
    When a field exists in both places, the top-level value wins, so cards that
    already loaded correctly produce the same result as before.

    Args:
        path: Location of the card JSON file.

    Returns:
        A dict containing only the keys in ``_KEEP_FIELDS`` whose values are
        strings, with every run of whitespace collapsed to one space and the
        ends stripped.

    Raises:
        ValueError: If the file does not contain a JSON object.
    """
    raw = orjson.loads(Path(path).read_bytes())
    if not isinstance(raw, dict):
        raise ValueError(
            f"Character card {path} must contain a JSON object, got {type(raw).__name__}"
        )

    # Nested V2 payload first, top-level second: later sources override earlier ones.
    nested = raw.get("data")
    sources = [nested, raw] if isinstance(nested, dict) else [raw]

    card: Dict[str, str] = {}
    for source in sources:
        for key, value in source.items():
            if key in _KEEP_FIELDS and isinstance(value, str):
                # Normalise whitespace
                card[key] = re.sub(r"\s+", " ", value).strip()
    return card


def _make_card_block(card: Dict[str, str]) -> str:
    """Return the canonical <CHAR_CARD> block used as system prompt."""
    lines = ["### <CHAR_CARD>"]
    lines.append(f"Name: {card.get('name', 'Unknown')}")
    if "species" in card:
        lines.append(f"Species: {card['species']}")
    if "age" in card:
        lines.append(f"Age: {card['age']}")
    if "gender" in card:
        lines.append(f"Gender: {card['gender']}")

    for field in ("description", "scenario", "personality", "first_person"):
        if field in card:
            pretty = card[field].replace("\n", " ")
            lines.append(f"{field.capitalize()}: {pretty}")

    lines.append("<|endofcard|>")
    return "\n".join(lines)

In [37]:
# --------------------------------------------------------------------------------------
# Synthetic sample generation
# --------------------------------------------------------------------------------------

from lmstudio import PredictionResult


def _fill_template(template: str, card: Dict[str, str]) -> str:
    """Fill the ``{placeholders}`` of *template* from *card* and random fillers.

    Args:
        template: A ``str.format`` style template.
        card: Character card attributes (``name``, ``description``, ...).

    Returns:
        The formatted, stripped prompt. If a placeholder cannot be resolved, a
        warning is printed and formatting is retried with only ``name``
        (defaulting to ``"Unknown"``) plus the template variables. If that also
        fails, the stripped template is returned unformatted so the caller can
        detect the leftover placeholders.
    """
    # Template variables; on a name clash they take precedence over card attributes.
    template_vars = {
        "question": random.choice(_DEFAULT_QUESTIONS),
        "topic": random.choice(_DEFAULT_TOPICS),
        "fact": card.get("description", "your past"),
        "user_prompt": random.choice(_DEFAULT_USER_PROMPTS),
        "situation": random.choice(_DEFAULT_SITUATIONS),
    }
    format_dict = {**card, **template_vars}

    try:
        # Single-step formatting using str.format()
        return template.format(**format_dict).strip()
    except KeyError as e:
        print(f"[warn] Missing key in template formatting: {e}")

    # Fallback with minimal required fields
    minimal_dict = {"name": card.get("name", "Unknown"), **template_vars}
    try:
        return template.format(**minimal_dict).strip()
    except (KeyError, IndexError, ValueError):
        # Only formatting errors are swallowed here; a bare `except` would also
        # have hidden KeyboardInterrupt and SystemExit.
        return template.strip()


async def _generate_for_card(
    card_path: Path,
    samples_per_card: int,
    max_tokens: int,
    temperature: float,
    top_p: float,
) -> List[Dict[str, Any]]:
    """Make *samples_per_card* generation attempts for one character card.

    Each attempt picks a random template, fills it, sends the card block plus the
    filled prompt to LM Studio, and keeps the reply as a system/user/assistant
    sample. Attempts are skipped (so fewer samples may be returned) when the
    prompt still contains ``{...}`` placeholders, when the reply has fewer than 3
    or more than 420 whitespace-separated words, or when generation raises.
    """

    character = _load_card(card_path)
    system_block = _make_card_block(character)

    collected: List[Dict[str, Any]] = []

    # Attempts run one after another; each reply is awaited before the next starts.
    for _attempt in tqdm(range(samples_per_card)):
        _mode, chosen_template = random.choice(_TEMPLATES)
        user_text = _fill_template(chosen_template, character)

        # Reject prompts that still contain brace-delimited placeholders.
        leftovers = re.findall(r'\{[^}]+\}', user_text)
        if leftovers:
            print(f"[warn] Skipping sample with unfilled placeholders: {leftovers}")
            continue

        # The model sees the card block followed by the user content.
        model_input = f"{system_block}\n\n{user_text}"

        try:
            result = await _lmstudio_chat(
                prompt=model_input,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
            )
            answer = result.content

            # Length filter on the whitespace-separated word count.
            word_count = len(answer.split())
            if word_count < 3 or word_count > 420:
                continue

            # ChatML-style record: system + user + assistant.
            collected.append(
                {
                    "messages": [
                        {"role": "system", "content": system_block},
                        {"role": "user", "content": user_text},
                        {"role": "assistant", "content": answer},
                    ]
                }
            )
        except Exception as exc:
            print(f"[warn] generation failed: {exc}")

    return collected

In [38]:
# --------------------------------------------------------------------------------------
# File helpers
# --------------------------------------------------------------------------------------

def _write_sharded_jsonl(samples: List[Dict[str, Any]], out_dir: Path, shard_size: int = 2000):
    """Write *samples* to ``out_dir`` as JSON Lines shards.

    The directory is created if needed. Shards are named ``synthetic_000.jsonl``,
    ``synthetic_001.jsonl``, ... and each holds at most *shard_size* samples, one
    JSON object per line. No file is written when *samples* is empty.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    shard_starts = range(0, len(samples), shard_size)
    for shard_no, start in enumerate(shard_starts):
        target = out_dir / f"synthetic_{shard_no:03d}.jsonl"
        with target.open("wb") as sink:
            for record in samples[start : start + shard_size]:
                sink.write(orjson.dumps(record))
                sink.write(b"\n")

## Go time!

First we need to convert the character card into a training set. Even the best-written cards lack the volume and variety of content needed to fine-tune on them without overfitting, so we use the information in the card to generate synthetic data based on the character.

In [39]:
# Accumulates every generated sample; this cell processes the single card at CHAR_JSON_PATH.
all_samples: List[Dict[str, Any]] = []

# The generation call below uses top-level `await`, which works inside a notebook cell.
# The commented-out `_driver` / `asyncio.run` lines are the wrapper a plain script would need.
# async def _driver():
card_path = Path(CHAR_JSON_PATH)
print(f"\n=== Generating for {card_path.name} ===")
# 400 generation attempts, each capped at 400 new tokens; skipped or failed attempts
# are not retried, so fewer than 400 samples may come back.
samples = await _generate_for_card(
    card_path,
    samples_per_card=400,
    max_tokens=400,
    temperature=0.8,
    top_p=0.9,
)
all_samples.extend(samples)

# asyncio.run(_driver())

# Persist the samples as JSONL shards under data/synthetic_dataset (default shard size).
print(f"\nGenerated {len(all_samples)} samples. Writing shards …")
_write_sharded_jsonl(all_samples, Path("data/synthetic_dataset"))
print("Done.")


=== Generating for Cricket.json ===


100%|██████████| 400/400 [1:06:45<00:00, 10.01s/it]


Generated 400 samples. Writing shards …
Done.





Once we have our synthetic dataset we just need to massage it into a format that we can consume using the datasets library during finetuning.

In [12]:
# Utility functions for dataset preparation

# Mask user and system tokens
def mask_user_system_tokens(tokenizer, text: str):
    """Return ``(input_ids, labels)`` with non-assistant tokens labelled -100.

    Everything up to and including the *last* ``<|im_start|>assistant`` tag is
    treated as user/system context and masked with -100. Tokens after the tag
    keep their ids as labels.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=False)``
            that returns a mapping with an ``"input_ids"`` list.
        text: The fully rendered dialogue.

    Returns:
        A tuple ``(input_ids, labels)`` of equal length. When the assistant tag is
        not found, every label is -100.
    """
    # Tokenise full dialogue first
    input_ids = tokenizer(text, add_special_tokens=False)["input_ids"]

    assistant_tag = "<|im_start|>assistant"
    tag_index = text.rfind(assistant_tag)

    if tag_index == -1:
        # No assistant turn found: mask everything.
        return input_ids, [-100] * len(input_ids)

    # Number of tokens in the context prefix, measured by tokenising it on its own.
    prefix_ids = tokenizer(text[: tag_index + len(assistant_tag)], add_special_tokens=False)[
        "input_ids"
    ]

    # Tokenising the prefix alone can yield more tokens than the full text has;
    # clamp so that labels never grows longer than input_ids.
    masked = min(len(prefix_ids), len(input_ids))
    labels = [-100] * masked + list(input_ids[masked:])

    return input_ids, labels

# Tokenizer for MODEL_NAME; `process` uses it both to render the chat template and to tokenise.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Process example
def process(example):
    """Render one ``{"messages": [...]}`` example with the chat template and tokenise it.

    Returns ``{"input_ids": [...]}`` truncated to ``MAX_LENGTH`` tokens, or ``None``
    when the example has no messages or none of them has both ``role`` and
    ``content``.
    """
    has_messages = "messages" in example and bool(example["messages"])
    if not has_messages:
        # Malformed example: signal it with None.
        return None

    # Keep only well-formed turns, reduced to the two keys the template needs.
    turns = []
    for message in example["messages"]:
        if "role" in message and "content" in message:
            turns.append({"role": message["role"], "content": message["content"]})

    if not turns:
        return None

    # Render the conversation to a single string with the model's chat template.
    rendered = tokenizer.apply_chat_template(turns, tokenize=False)

    # Tokenise the rendered conversation; plain Python lists are returned.
    encoding = tokenizer(
        rendered,
        add_special_tokens=False,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors=None,
    )

    return {"input_ids": encoding["input_ids"]}

In [13]:
# Load the generated samples.
# NOTE(review): only the first shard is read; if generation ever produces more than
# one shard, the later ones are ignored here.
ds = load_dataset("json", data_files="data/synthetic_dataset/synthetic_000.jsonl", split="train")


def process_and_filter(example):
    """Return True when `process` can turn *example* into a tokenised record."""
    return process(example) is not None


# `process` returns None for malformed rows, and `map` does not drop rows, so the
# malformed ones are filtered out first. After this, `map` only receives rows for
# which `process` returns a dict.
valid_ds = ds.filter(process_and_filter)

tokenised_ds = valid_ds.map(process, remove_columns=ds.column_names, num_proc=4)
tokenised_ds = tokenised_ds.filter(lambda x: len(x["input_ids"]) > 0)  # Remove empty examples

# Save the dataset
Path("data/tokenized/cricket").mkdir(parents=True, exist_ok=True)
tokenised_ds.save_to_disk("data/tokenized/cricket")

Map (num_proc=4): 100%|██████████| 400/400 [00:00<00:00, 1426.30 examples/s]
Filter: 100%|██████████| 400/400 [00:00<00:00, 3111.61 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 400/400 [00:00<00:00, 94317.61 examples/s] 
