In [1]:
#

In [2]:
import pandas as pd
import regex as re
import os
from pathlib import Path
import json

In [3]:
# Load the prompt template (UTF-8 text) that the placeholders are injected into
prompt_template = Path('./PROMPT_TEMPLATE.md').read_text(encoding='utf-8')

In [4]:
# Collect every markdown file under ./DOCS/ as a forward-slash path.
# os.walk yields entries in arbitrary, filesystem-dependent order, so the
# result is sorted to keep anything that iterates over it deterministic.
list_docs = sorted(
    os.path.join(root, file).replace(os.sep, '/')
    for root, dirs, files in os.walk('./DOCS/')
    for file in files
    if file.endswith('.md')
)

# Unique placeholder tokens in order of first appearance in the template.
# The character classes keep each match inside a single [...] pair on one line;
# a lazy `.*?` would start at the first '[' on the line and swallow unrelated
# bracketed text such as "[note] see [DOC_X_PLACEHOLDER]".
list_placeholders = list(dict.fromkeys(
    re.findall(r'\[[^\[\]\n]*_PLACEHOLDER[^\[\]\n]*\]', prompt_template)
))

In [5]:
# Automatically map placeholders to document paths with assertions
# Number of document placeholders expected (CHAPTER_JSON_PLACEHOLDER excluded)
EXPECTED_DOC_COUNT = 7

dict_docs_placeholders = {}

for placeholder in list_placeholders:
    # Skip CHAPTER_JSON_PLACEHOLDER (not a document)
    if 'CHAPTER_JSON' in placeholder:
        continue

    # [DOC_MASTER_SUMMARY_PLACEHOLDER] -> token DOC_MASTER_SUMMARY_PLACEHOLDER,
    # filename DOC_MASTER_SUMMARY
    token = placeholder.strip('[]')
    filename = token.replace('_PLACEHOLDER', '')

    # Prefer a file whose stem is exactly the token or the bare filename.
    # A plain substring test would also accept e.g. DOC_SESSION_70 for
    # DOC_SESSION_7 and silently keep whichever path happened to come first.
    candidates = [doc_path for doc_path in list_docs if Path(doc_path).stem in (token, filename)]
    if not candidates:
        # Fall back to substring matching for files that carry extra prefixes/suffixes
        candidates = [doc_path for doc_path in list_docs if filename in Path(doc_path).stem]

    # Assert that we found a match, and that it is the only one
    assert candidates, f"No document found for placeholder: {placeholder} (looking for: {filename})"
    assert len(candidates) == 1, (
        f"Ambiguous documents for placeholder: {placeholder} (looking for: {filename}): {candidates}"
    )

    dict_docs_placeholders[placeholder] = candidates[0]

# Assert we have exactly the expected number of document mappings (excluding CHAPTER_JSON_PLACEHOLDER)
assert len(dict_docs_placeholders) == EXPECTED_DOC_COUNT, f"Expected {EXPECTED_DOC_COUNT} documents, found {len(dict_docs_placeholders)}"

# Assert all mapped files exist
for placeholder, doc_path in dict_docs_placeholders.items():
    assert os.path.exists(doc_path), f"File not found: {doc_path} for {placeholder}"

print("✓ All assertions passed!")
print(f"✓ Mapped {len(dict_docs_placeholders)} documents successfully")
dict_docs_placeholders

✓ All assertions passed!
✓ Mapped 7 documents successfully


{'[DOC_MASTER_SUMMARY_PLACEHOLDER]': './DOCS/DOC_MASTER_SUMMARY_PLACEHOLDER.md',
 '[DOC_SESSION_7_PLACEHOLDER]': './DOCS/DOC_SESSION_7_PLACEHOLDER.md',
 '[DOC_SESSION_9_PLACEHOLDER]': './DOCS/DOC_SESSION_9_PLACEHOLDER.md',
 '[DOC_INCEPTION_PLACEHOLDER]': './DOCS/DOC_INCEPTION_PLACEHOLDER.md',
 '[DOC_SWOT_PLACEHOLDER]': './DOCS/DOC_SWOT_PLACEHOLDER.md',
 '[DOC_FEEDBACK_PLACEHOLDER]': './DOCS/DOC_FEEDBACK_PLACEHOLDER.md',
 '[DOC_GITLOG_PLACEHOLDER]': './DOCS/DOC_GITLOG_PLACEHOLDER.md'}

In [6]:
def create_prompt(part_json_data, prompt_template, dict_docs_placeholders):
    """
    Create a complete prompt by injecting documents and chapter JSON into the template.

    All placeholders are substituted in a single pass over the template, so text
    that arrives via a document or the JSON payload is inserted verbatim: it is
    never rescanned for placeholders and cannot trip the unreplaced-placeholder
    check.

    Args:
        part_json_data: JSON-serialisable part details (from toc.json)
        prompt_template: String containing the prompt template with placeholders
        dict_docs_placeholders: Dict mapping placeholders to document file paths

    Returns:
        String with all placeholders replaced

    Raises:
        AssertionError: If the template contains a ``[..._PLACEHOLDER...]`` token
            for which no replacement is available.
        OSError: If a mapped document cannot be opened or read.
    """
    # Read each document once and collect placeholder -> replacement text
    replacements = {}
    for placeholder, doc_path in dict_docs_placeholders.items():
        with open(doc_path, 'r', encoding='utf-8') as f:
            replacements[placeholder] = f.read()

    # The chapter JSON fills its own placeholder; an explicit document mapping
    # for that same placeholder keeps precedence.
    replacements.setdefault('[CHAPTER_JSON_PLACEHOLDER]', json.dumps(part_json_data, indent=2))

    # Validate the template itself rather than the finished prompt, so that
    # placeholder-looking text inside injected content is not reported.
    # The character classes confine each token to one [...] pair on one line.
    template_tokens = re.findall(r'\[[^\[\]\n]*_PLACEHOLDER[^\[\]\n]*\]', prompt_template)
    remaining_placeholders = [token for token in template_tokens if token not in replacements]
    if remaining_placeholders:
        # Raised explicitly (not via `assert`) so the check survives `python -O`
        raise AssertionError(f"Unreplaced placeholders found: {remaining_placeholders}")

    # Longest keys first so a placeholder that is a prefix of another cannot shadow it
    ordered_keys = sorted(replacements, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(key) for key in ordered_keys))

    # A callable replacement inserts the text literally (no backslash/group expansion)
    return pattern.sub(lambda match: replacements[match.group(0)], prompt_template)

In [7]:
# Parse the table-of-contents JSON file into `data`
with open('./toc.json', encoding='utf-8') as toc_file:
    toc_text = toc_file.read()
data = json.loads(toc_text)

In [8]:
# Build the working table from the 'field_manual_structure' section of the loaded JSON
manual_structure = data['field_manual_structure']
df = pd.DataFrame(manual_structure)

In [9]:
# Rename 'parts' to 'JSON_PART', then build one prompt per row from that column.
# Series.apply forwards the extra keyword arguments to create_prompt.
df = df.rename({'parts': 'JSON_PART'}, axis='columns')
df['PROMPT'] = df['JSON_PART'].apply(
    create_prompt,
    prompt_template=prompt_template,
    dict_docs_placeholders=dict_docs_placeholders,
)

In [10]:
# PROMPT_ID is the row index plus one, rendered as a string zero-padded to width 3
prompt_numbers = pd.Series(df.index + 1, index=df.index)
df['PROMPT_ID'] = prompt_numbers.apply(lambda number: str(number).zfill(3))
df

Unnamed: 0,JSON_PART,PROMPT,PROMPT_ID
0,"{'part_number': 1, 'title': 'Genesis', 'subtit...",# PROJECT DOCUMENTATION\n\nYou have access to ...,1
1,"{'part_number': 2, 'title': 'The Design Sprint...",# PROJECT DOCUMENTATION\n\nYou have access to ...,2
2,"{'part_number': 3, 'title': 'The First Disaste...",# PROJECT DOCUMENTATION\n\nYou have access to ...,3
3,"{'part_number': 4, 'title': 'The Cooling Perio...",# PROJECT DOCUMENTATION\n\nYou have access to ...,4
4,"{'part_number': 5, 'title': 'The Debugging Inn...",# PROJECT DOCUMENTATION\n\nYou have access to ...,5
5,"{'part_number': 6, 'title': 'The Quality Sprin...",# PROJECT DOCUMENTATION\n\nYou have access to ...,6
6,"{'part_number': 7, 'title': 'The Overcorrectio...",# PROJECT DOCUMENTATION\n\nYou have access to ...,7
7,"{'part_number': 8, 'title': 'The Token Economi...",# PROJECT DOCUMENTATION\n\nYou have access to ...,8
8,"{'part_number': 9, 'title': 'The Classificatio...",# PROJECT DOCUMENTATION\n\nYou have access to ...,9
9,"{'part_number': 10, 'title': 'The Ergonomics C...",# PROJECT DOCUMENTATION\n\nYou have access to ...,10


In [11]:
# Write the prompt table to disk using pickle protocol 4
prompts_pickle_path = './INPUT_PROMPTS.pkl'
df.to_pickle(prompts_pickle_path, protocol=4)