Change the notebook's working directory to the parent directory

In [None]:
import os
from pathlib import Path

# Show the working directory the notebook kernel is currently in.
scope = Path().resolve()
print(scope)

# Move one level up so the relative paths used later (".env", "dataset/...")
# resolve against the parent directory.
# NOTE(review): the target is relative to the current directory, so re-running
# this cell moves up one more level each time.
os.chdir("..")

# Show the working directory after the change.
scope = Path().resolve()
print(scope)

Prerequisites

In [None]:
import json
import re
from datetime import datetime

import pandas as pd
from dotenv import load_dotenv

from utils.llm import llm

# Load environment variables from the ".env" file in the current working directory.
load_dotenv(".env")

# Object that is called below with a prompt string to obtain the model's reply.
# NOTE(review): `llm` is project-local; its return type is not visible here.
model = llm()

# Prompt taken directly from the Kaggle starter notebook.
# The single `{}` placeholder is filled with the text description through
# `svg_prompt.format(desc)`, so any literal brace added to this template must
# be doubled. The prompt ends with an opened svg fence and <svg> tag,
# presumably so that the model continues with the SVG body.
svg_prompt = """Generate SVG code to visually represent the following text description, while respecting the given constraints.
<constraints>
* **Allowed Elements:** `svg`, `path`, `circle`, `rect`, `ellipse`, `line`, `polyline`, `polygon`, `g`, `linearGradient`, `radialGradient`, `stop`, `defs`
* **Allowed Attributes:** `viewBox`, `width`, `height`, `fill`, `stroke`, `stroke-width`, `d`, `cx`, `cy`, `r`, `x`, `y`, `rx`, `ry`, `x1`, `y1`, `x2`, `y2`, `points`, `transform`, `opacity`
</constraints>

<example>
<description>"A red circle with a blue square inside"</description>
```svg
<svg viewBox="0 0 256 256" width="256" height="256">
  <circle cx="50" cy="50" r="40" fill="red"/>
  <rect x="30" y="30" width="40" height="40" fill="blue"/>
</svg>
```
</example>


Please ensure that the generated SVG code is well-formed, valid, and strictly adheres to these constraints. Focus on a clear and concise representation of the input description within the given limitations. Always give the complete SVG code with nothing omitted. Never use an ellipsis.

<description>"{}"</description>
```svg
<svg viewBox="0 0 256 256" width="256" height="256">
"""

# Prompt asking the model for `{limit}` new SVG descriptions, with existing
# descriptions supplied as `{examples}`; both fields are filled in with
# `str.format(limit=..., examples=...)`.
# It ends with an opened json fence written with three backticks, the same
# marker `postprocess_response` looks for, to steer the model towards fenced
# JSON output (the previous two-backtick fence was malformed markdown).
synthetic_desc_prompt = """Your task is to generate {limit} unique descriptions for SVG.
<example>
{examples}
</example>
Important: you cannot use examples directly and output only in json.

```json
"""

Utility functions

In [34]:
def postprocess_response(content: str) -> str:
    """
    Remove markdown delimiters

    Return the text inside the first ```json ... ``` fence found in
    `content`. When there is no complete fence (no opening marker, or the
    closing backticks are missing, e.g. because the reply was cut off),
    `content` is returned unchanged instead of raising `IndexError`.
    """
    # Non-greedy group + DOTALL: capture everything, newlines included, up to
    # the first closing fence.
    match = re.search(r"```json(.*?)```", content, re.DOTALL)
    if match is None:
        return content
    return match.group(1)


def get_descriptions() -> list[str]:
    """
    Read dataset/saved_descriptions.json (relative to the current working
    directory) and return its parsed JSON content.
    """
    with open("dataset/saved_descriptions.json", "r") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def save_descriptions(new_desc: list[str]) -> None:
    """
    save unique processed descriptions

    Merge `new_desc` into the descriptions already stored in
    dataset/saved_descriptions.json and rewrite the file without duplicates.
    Entries already in the file keep their position and new ones are appended
    in the order given, so the file content is stable from run to run (a
    plain `set` gives no ordering guarantee and reshuffled the file on save).
    """
    # dict keys behave as an insertion-ordered set.
    merged = dict.fromkeys(get_descriptions())
    merged.update(dict.fromkeys(new_desc))
    with open("dataset/saved_descriptions.json", "w") as file:
        json.dump(list(merged), file)


def list_to_str(list_desc: list[str]) -> str:
    """
    Join the descriptions into one string, one description per line.

    Every entry, including the last one, is followed by a newline; an empty
    list yields an empty string.
    """
    # str.join builds the result in one pass instead of repeated `+=`.
    return "".join(desc + "\n" for desc in list_desc)


Generate new descriptions

This is not the best way to generate synthetic descriptions, but it should work well enough for the first few hundred samples.

In [None]:
def generate_new_descriptions():
    """
    Ask the model for 50 new SVG descriptions, using the saved descriptions
    as examples, and return the parsed JSON reply.

    Raises:
        ValueError: "rate limit reached" when the model returns an empty
            (falsy) reply; "not json serializable" when the reply cannot be
            parsed as JSON.
    """
    descriptions_string = list_to_str(get_descriptions())

    new_desc = model(
        synthetic_desc_prompt.format(limit=str(50), examples=descriptions_string)
    )

    # Checked outside the try block: when raised inside it, this error was
    # caught by the broad handler and misreported as a JSON problem.
    if not new_desc:
        raise ValueError("rate limit reached")

    try:
        return json.loads(postprocess_response(new_desc))
    except (ValueError, IndexError, TypeError) as err:
        # json.JSONDecodeError is a ValueError; IndexError/TypeError cover a
        # malformed fence or a non-string reply. Chain to keep the root cause.
        raise ValueError("not json serializable") from err


# Runs when the cell executes: requests a new batch of descriptions from the model.
train_data = generate_new_descriptions()

Generate new SVGs

In [None]:
def generate(list_desc):
    """
    Generate an SVG for every description and store the results as a CSV.

    Each description is inserted into `svg_prompt` and sent to the model.
    Rows whose generation failed (an exception or a `None` reply) are
    dropped. The remaining rows are written to a timestamped CSV under
    dataset/svg_datasets/, and only after that write succeeds are their
    descriptions recorded through `save_descriptions`.

    Args:
        list_desc: iterable of text descriptions.
    """
    output_file = Path(
        f"dataset/svg_datasets/svg_gemini-2.5-pro_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv"
    )
    df_dict = {
        "description": [],
        "prompt": [],
        "svg": [],
        # "processed_svg": [], Can be done later
    }
    for desc in list_desc:
        # Built once and reused for both the model call and the stored row.
        prompt = svg_prompt.format(desc)
        try:
            svg = model(prompt)
        except Exception as err:
            # Deliberate best effort: a failed item (rate limit exceeded or
            # any other error) is stored as None and dropped below, but the
            # failure is reported instead of being swallowed silently.
            print(f"SVG generation failed for {desc!r}: {err}")
            svg = None

        df_dict["description"].append(desc)
        df_dict["prompt"].append(prompt)
        df_dict["svg"].append(svg)

    df = pd.DataFrame(df_dict).dropna()

    # Write the CSV first: if the write fails, the descriptions must not be
    # marked as processed, otherwise their SVGs would be lost for good.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_file)
    save_descriptions(list(df["description"]))


# Runs when the cell executes: generates and saves SVGs for the new descriptions.
generate(train_data)