# Standalone LLM Report Generation Demo (Autoregressive, Multi-Iteration, Scores in Docx)

This notebook demonstrates end-to-end autoregressive report generation, evaluation, self-reflection, and Word export using OpenAI and local prompt templates.

- Each subsection is generated once and then refined over 5 autoregressive self-reflection iterations (evaluate, then regenerate).
- Each step uses the previous content and evaluation scores.
- Separate prompt templates are used for generation and evaluation (per subsection if available).
- The final docx includes the most recent evaluation scores (groundedness, completeness, coherence) after each subsection.
- All logic is contained in this notebook.
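- Prompt templates must be in `app/templates/`. Generation templates are looked up, most specific first, as `<group_id>_<section>_<subsection>_prompt_template.txt`, `<group_id>_<section>_prompt_template.txt`, `<group_id>_prompt_template.txt`, then `<doc_type>_prompt_template.txt`; evaluation templates use the same names with the suffix `_eval_prompt_template.txt`.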
- You need an OpenAI API key.

In [None]:
import os
import openai
import re
from docx import Document
from typing import List, Dict
from IPython.display import display, Markdown

# Read the OpenAI API key from the environment so the secret is not stored in
# the notebook; the placeholder is only a fallback for quick local experiments.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'YOUR_OPENAI_KEY')  # <-- export OPENAI_API_KEY or replace the fallback
# Directory in which prompt template files are looked up.
TEMPLATE_DIR = 'app/templates'

## 1. Define Report Structure and Group

In [None]:
# Report outline: maps each top-level section title to the ordered list of
# subsection titles under it. The generation loop walks this mapping in
# insertion order, one subsection at a time.
sections = {
    'Executive Summary': ['Overview', 'Key Points'],
    'Business Goals': ['Objectives', 'KPIs'],
    'Model Data': ['Data Sources', 'Data Quality'],
    'Model Selection': ['Candidate Models', 'Selection Criteria'],
    'Model Performance': ['Metrics', 'Validation Results'],
    'Model Testing': ['Test Strategy', 'Test Results'],
    'Model Monitoring': ['Monitoring Plan', 'Alerting & Retraining']
}
# Template group: used as the filename prefix when prompt templates are looked up.
group_id = 'general'  # or 'credit', 'capital', etc.
# Title written as the top-level heading of the exported Word document.
title = 'Standalone LLM Validation Report'

## 2. Helper Functions: Prompt Template Loading, OpenAI Calls, Evaluation, Self-Reflection, Docx Export

In [None]:
def load_group_template(doc_type: str, group_id: str, section: str = None, subsection: str = None, eval_mode=False) -> str:
    """Return the text of the most specific prompt template that exists.

    Candidate files are tried in this order inside ``TEMPLATE_DIR`` and the
    first one found wins:

    1. ``{group_id}_{section}_{subsection}{suffix}`` (only if both are given)
    2. ``{group_id}_{section}{suffix}`` (only if ``section`` is given)
    3. ``{group_id}{suffix}``
    4. ``{doc_type}{suffix}``

    Args:
        doc_type: Document type; used as the last-resort template name.
        group_id: Template group; filename prefix for the first three candidates.
        section: Optional section title used verbatim in the filename.
        subsection: Optional subsection title used verbatim in the filename.
        eval_mode: If true, look for ``_eval_prompt_template.txt`` files
            instead of ``_prompt_template.txt`` files.

    Returns:
        The full contents of the first matching template file.

    Raises:
        FileNotFoundError: If none of the candidate files exists.
    """
    suffix = '_eval_prompt_template.txt' if eval_mode else '_prompt_template.txt'
    candidates = []
    if section and subsection:
        candidates.append(f'{TEMPLATE_DIR}/{group_id}_{section}_{subsection}{suffix}')
    if section:
        candidates.append(f'{TEMPLATE_DIR}/{group_id}_{section}{suffix}')
    candidates.append(f'{TEMPLATE_DIR}/{group_id}{suffix}')
    candidates.append(f'{TEMPLATE_DIR}/{doc_type}{suffix}')
    for path in candidates:
        if os.path.exists(path):
            # Decode explicitly as UTF-8: the platform default encoding varies
            # and would garble non-ASCII characters in templates.
            with open(path, 'r', encoding='utf-8') as f:
                return f.read()
    raise FileNotFoundError(f'No prompt template found for {group_id}, {section}, {subsection}, {doc_type}, eval_mode={eval_mode}')

def call_openai(prompt: str, system_prompt: str = None, model: str = 'gpt-4') -> str:
    """Send a single-turn chat request to OpenAI and return the reply text.

    Args:
        prompt: Content of the user message.
        system_prompt: Optional content of a system message placed before the
            user message; omitted when empty or ``None``.
        model: Chat model name. Defaults to ``'gpt-4'``.

    Returns:
        The message content of the first choice in the response.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})
    if hasattr(openai, 'OpenAI'):
        # openai>=1.0 removed openai.ChatCompletion; the module-level client
        # still honours openai.api_key and returns typed response objects.
        response = openai.chat.completions.create(model=model, messages=messages)
        return response.choices[0].message.content
    # Legacy (openai<1.0) interface with dict-style responses.
    response = openai.ChatCompletion.create(model=model, messages=messages)
    return response['choices'][0]['message']['content']

def parse_system_user_prompt(template: str):
    """Split a template into its ``(system, user)`` prompt parts.

    When the template contains both the ``---SYSTEM PROMPT---`` and the
    ``---USER PROMPT---`` marker, the stripped text following each marker is
    returned. Otherwise the system part is empty and the whole template,
    unmodified, is the user part.
    """
    system_marker = '---SYSTEM PROMPT---'
    user_marker = '---USER PROMPT---'
    # Guard clause: without both markers the template is a plain user prompt.
    if system_marker not in template or user_marker not in template:
        return '', template
    after_system = template.split(system_marker)[1]
    system_text = after_system.split(user_marker)[0].strip()
    user_text = template.split(user_marker)[1].strip()
    return system_text, user_text

# Matches ``{{name}}`` placeholders in prompt templates.
_PLACEHOLDER_RE = re.compile(r'\{\{(\w+)\}\}')
# First signed integer or decimal number in a string.
_FLOAT_RE = re.compile(r'[-+]?[0-9]*\.?[0-9]+')
# Score names, in the order the evaluator is expected to report them.
_SCORE_NAMES = ('groundedness', 'completeness', 'coherence')


def _fill_placeholders(template, values):
    """Substitute ``{{key}}`` placeholders in one pass; unknown keys stay as-is."""
    # A single pass means placeholder-like text inside an inserted value
    # (e.g. in the previous generation) is never substituted a second time.
    return _PLACEHOLDER_RE.sub(lambda m: values.get(m.group(1), m.group(0)), template)


def _parse_eval_scores(eval_response):
    """Extract groundedness/completeness/coherence floats from an evaluator reply."""
    # Preferred: a number following each score's name on the same line.
    labelled = {}
    for name in _SCORE_NAMES:
        match = re.search(name + r'[^0-9\n]*?([-+]?[0-9]*\.?[0-9]+)', eval_response, re.IGNORECASE)
        if match:
            labelled[name] = float(match.group(1))
    if len(labelled) == len(_SCORE_NAMES):
        return labelled
    # Fallback: first number on each of the first three lines that contain one.
    # Lines without any number (blank lines, preamble) are skipped rather than
    # being counted as a 0.0 score.
    positional = []
    for line in eval_response.split('\n'):
        match = _FLOAT_RE.search(line)
        if match:
            positional.append(float(match.group()))
            if len(positional) == len(_SCORE_NAMES):
                break
    positional += [0.0] * (len(_SCORE_NAMES) - len(positional))
    return dict(zip(_SCORE_NAMES, positional))


def autoregressive_self_reflect(section, sub, context, group_id, doc_type='report', n_iter=5):
    """Generate a subsection, then refine it through evaluate/regenerate rounds.

    An initial text is generated from the generation template. Each of the
    ``n_iter`` rounds then scores the current text with the evaluation template
    and regenerates it, feeding the current text and its scores back into the
    generation template. The text produced by the last round is scored once
    more so that the final entry of the returned score list describes the
    returned text.

    Args:
        section: Section title; fills ``{{section}}`` and selects the template.
        sub: Subsection title; fills ``{{subsection}}`` and selects the template.
        context: Text that fills ``{{context}}``.
        group_id: Template group passed to ``load_group_template``.
        doc_type: Fallback template name passed to ``load_group_template``.
        n_iter: Number of evaluate/regenerate rounds. With ``n_iter <= 0`` only
            the initial generation runs and no evaluation is performed.

    Returns:
        ``(final_text, all_scores)`` where ``all_scores`` is a list of dicts
        with keys ``groundedness``, ``completeness`` and ``coherence``: one per
        evaluated draft, ending with the scores of ``final_text`` (empty when
        ``n_iter <= 0``).

    Raises:
        FileNotFoundError: If a required template cannot be found.
    """
    # Templates do not change between rounds, so load and split them once.
    gen_template = load_group_template(doc_type, group_id, section, sub)
    gen_sys, gen_user = parse_system_user_prompt(gen_template)
    base_values = {'section': section, 'subsection': sub, 'context': context}

    def generate(previous_text, scores):
        values = dict(base_values, previous_generation=previous_text)
        for name in _SCORE_NAMES:
            values[name + '_score'] = str(scores[name])
        return call_openai(_fill_placeholders(gen_user, values), gen_sys)

    # Initial generation: no previous text, all scores reported as 0.0.
    text = generate('', dict.fromkeys(_SCORE_NAMES, 0.0))
    if n_iter <= 0:
        return text, []

    eval_template = load_group_template(doc_type, group_id, section, sub, eval_mode=True)
    eval_sys, eval_user = parse_system_user_prompt(eval_template)

    def evaluate(candidate):
        values = dict(base_values, generated_text=candidate)
        return _parse_eval_scores(call_openai(_fill_placeholders(eval_user, values), eval_sys))

    all_scores = []
    for _ in range(n_iter):
        scores = evaluate(text)
        all_scores.append(scores)
        text = generate(text, scores)
    # Score the final text as well; otherwise the last recorded scores would
    # belong to the previous draft rather than to the text that is returned.
    all_scores.append(evaluate(text))
    return text, all_scores

def export_to_docx(title, sections_output, output_path='standalone_generated_report.docx'):
    """Write the generated report to a Word document.

    Args:
        title: Text of the document's title heading.
        sections_output: Iterable of dicts with keys ``section``,
            ``subsection``, ``generated_text`` and ``scores`` (a list of score
            dicts). Consecutive entries sharing a ``section`` are grouped under
            a single level-1 heading.
        output_path: Path of the .docx file to write. Defaults to
            ``standalone_generated_report.docx`` in the working directory.

    For every entry the subsection heading and text are followed by one
    paragraph showing the last score dict in ``scores``; zeros are shown when
    the list is empty or missing.
    """
    doc = Document()
    doc.add_heading(title, 0)
    current_section = None
    for so in sections_output:
        # Emit the section heading only when the section changes.
        if so['section'] != current_section:
            doc.add_heading(so['section'], level=1)
            current_section = so['section']
        doc.add_heading(so['subsection'], level=2)
        doc.add_paragraph(so['generated_text'])
        # Add evaluation scores (last recorded set, or zeros if none exist).
        score_history = so.get('scores') or []
        scores = score_history[-1] if score_history else {'groundedness':0, 'completeness':0, 'coherence':0}
        doc.add_paragraph(f"Scores: Groundedness={scores['groundedness']}, Completeness={scores['completeness']}, Coherence={scores['coherence']}", style='Intense Quote')
    doc.save(output_path)
    print(f'Saved as {output_path}')


## 3. Autoregressive Generation, Evaluation, and Self-Reflection (5 Iterations)

In [None]:
# Generate every subsection in outline order, collecting text and scores.
sections_output = []
# Running context: the final text of every subsection generated so far,
# one per line, passed to each subsequent generation.
context = ''
for section, subs in sections.items():
    for sub in subs:
        final_text, all_scores = autoregressive_self_reflect(section, sub, context, group_id, doc_type='report', n_iter=5)
        sections_output.append({
            'section': section,
            'subsection': sub,
            'generated_text': final_text,
            'scores': all_scores
        })
        # Newline written as an escape sequence; a literal line break inside
        # the quotes is an unterminated string and a SyntaxError.
        context += final_text + '\n'
        display(Markdown(f'### {section} / {sub}'))
        display(Markdown(final_text))
        if all_scores:
            last = all_scores[-1]
            display(Markdown(f"*Groundedness: {last['groundedness']} | Completeness: {last['completeness']} | Coherence: {last['coherence']}*"))

## 4. Export the Final Report as a Word Document (.docx) (with Scores)

In [None]:
# Write the collected subsections and their scores to the Word document.
export_to_docx(title, sections_output)
# Optionally, display the docx content: re-open the saved file and print the
# text of each paragraph as a quick check of what was written.
from docx import Document as DocxDocument
doc = DocxDocument('standalone_generated_report.docx')
for para in doc.paragraphs:
    print(para.text)