# Pipeline

This is meant to be a more pipelined version of the `bullet_journal_llm` notebook experiment. This will
do bulk extraction of notes from the notebooks, with different prompts based on the notebook.

In [1]:
from pathlib import Path
import logging
import sys
import yaml

import google.generativeai as genai
import os

_LOGGER = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

SECRETS = Path("../secrets.yaml")
MODEL_ID = 'gemini-1.5-flash'

secrets = yaml.safe_load(SECRETS.read_text())
genai.configure(api_key=secrets["gemini_api_key"])

_LOGGER.info("Initializing client library")
model = genai.GenerativeModel(MODEL_ID)

INFO:__main__:Initializing client library


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
NOTES_DIR = Path("../notes")
PAGES_DIR = Path("../pages")
DYNAMIC_PROMPTS_DIR = Path("dynamic_prompts")

In [3]:
from model import DynamicPrompt

dynamic_prompts = {}
for filename in DYNAMIC_PROMPTS_DIR.glob("*.yaml"):
    _LOGGER.info(f"Loading dynamic prompt: {filename}")
    dynamic_prompts[filename.stem] = DynamicPrompt.from_file(filename)


DEFAULT = [
    "default",
    "rapid_log_legend",
    "profile",
]
FILE_PREFIX_PROMPT_MAP = {
    "Daily": [
        *DEFAULT,
        "daily",
    ],
    "Weekly": [
        *DEFAULT,
        "weekly",
    ],
    "Monthly": [
        *DEFAULT,
        "monthly",
    ]
}


def get_dynamic_prompts(page_filename: Path) -> list[DynamicPrompt]:
    """Get a set of prompts that match the given prefix"""

    page_name = page_filename.stem
    prefix = page_name.split("-")[0]
    prompt_names = FILE_PREFIX_PROMPT_MAP.get(prefix, DEFAULT)

    prompts = []
    for prompt_prefix in prompt_names:
        for key, value in dynamic_prompts.items():
            if key.startswith(prompt_prefix):
                prompts.append(value)
    return prompts


INFO:__main__:Loading dynamic prompt: dynamic_prompts/daily.yaml
INFO:__main__:Loading dynamic prompt: dynamic_prompts/monthly-01.yaml
INFO:__main__:Loading dynamic prompt: dynamic_prompts/rapid_log_legend.yaml
INFO:__main__:Loading dynamic prompt: dynamic_prompts/weekly-02.yaml
INFO:__main__:Loading dynamic prompt: dynamic_prompts/weekly-01.yaml
INFO:__main__:Loading dynamic prompt: dynamic_prompts/monthly-02.yaml
INFO:__main__:Loading dynamic prompt: dynamic_prompts/profile.yaml
INFO:__main__:Loading dynamic prompt: dynamic_prompts/default.yaml
INFO:__main__:Loading dynamic prompt: dynamic_prompts/monthly-03.yaml


In [4]:
import random
import datetime
import re

pages = list(PAGES_DIR.glob("*.png"))
# pages_sample = random.sample(pages, 5)

In [5]:
from typing import Any

EXTRACT_JSON = re.compile("```json\n(.*?)\n```", re.DOTALL)


def parse_model_response(response_text: str) -> str:
    """Parse the response from the model and return a yaml string."""

    if response_text.startswith("```json"):
        text = EXTRACT_JSON.match(response_text).group(1)
    else:
        text = response_text

    try:
        obj = json.loads(text)
    except ValueError as err:
        _LOGGER.error(f"Error processing: %s", err)
        return text
    
    for k, v in list(obj.items()):
        if v is None or v == "null":
            del obj[k]

    return yaml.dump(obj, explicit_start=True, sort_keys=False)


In [6]:
import PIL.Image
import json
from tqdm import tqdm

DOCS = Path("../docs")

TIMESTAMP_RE = re.compile(".*?-\d+-P(\d{20}).*.png")

FILE_PROMPT = """
Please answer in json with no other formatting since the answer will be parsed programmatically.

Filename: {filename}
Created At: {created_at}
Content:
"""

for page in tqdm(pages):
    re_match = TIMESTAMP_RE.match(str(page))
    created_at = datetime.datetime.strptime(re_match.group(1), "%Y%m%d%H%M%S%f")

    prompts = get_dynamic_prompts(page)
    
    output_filename = DOCS / f"{page.stem}.yaml"
    if output_filename.exists():
        continue

    _LOGGER.info(f"Generating {output_filename}")
    img = PIL.Image.open(str(page))
    prompt = "\n\n".join([p.as_prompt() for p in prompts])

    response = model.generate_content([prompt, FILE_PROMPT.format(filename=f"{page.stem}.png", created_at=created_at.isoformat()), img])

    output = parse_model_response(response.text)

    with open(output_filename, "w") as f:
        f.write(output)


  0%|          | 0/157 [00:00<?, ?it/s]

100%|██████████| 157/157 [00:00<00:00, 30158.27it/s]


In [8]:
from pathlib import Path
from model import JournalPage
from mashumaro.exceptions import MissingField

DOCS = Path("../docs")

for doc_filename in DOCS.glob("*.yaml"):   
    text = doc_filename.read_text()
    try:
        JournalPage.from_yaml(text)
    except MissingField as err:
        _LOGGER.error(f"Error processing {doc_filename}: {err}")
        continue

In [11]:
JournalPage.from_yaml((DOCS / "Future Log-01-P20231219210105181293BAOKeqohWGBJ.yaml").read_text())

JournalPage(filename='Future Log-01-P20231219210105181293BAOKeqohWGBJ.png', created_at='2023-12-19T21:01:05.181293', label=None, date=None, content=[{'month': 'APR', 'tasks': [{'type': '>', 'description': 'use marina trip?'}, {'type': '>', 'description': 'yard house $'}, {'type': 'X', 'description': 'Tech Impact Award : 10 before 5/12'}]}, {'month': 'MAY', 'tasks': [{'type': '>', 'description': 'Patio leak review / 1km'}, {'type': '>', 'description': 'use marina trip'}, {'type': '>', 'description': 'yard house $'}, {'type': '•', 'description': 'QQ bytes solved (accuracy)'}, {'type': 'o', 'description': '0509 Teacher PID'}, {'type': 'o', 'description': '0503 - 0505 YD Trip'}, {'type': '•', 'description': 'ETP 0506 - 0521'}]}, {'month': 'JUN', 'tasks': [{'type': 'X', 'description': 'QQ bytes solved (Art)'}, {'type': '•', 'description': 'Patio leak review'}, {'type': '>', 'description': 'marina trip'}, {'type': '>', 'description': 'yard house $'}, {'type': '>', 'description': 'sensitive d