In [None]:
import os
from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider

def generate_rca_html(transcript_md: str, formatting_template: str) -> str:
    """
    Ask the Azure OpenAI deployment to render an RCA transcript as styled HTML.

    The endpoint and deployment name are read from the ENDPOINT_URL and
    DEPLOYMENT_NAME environment variables (each with a built-in fallback), and
    the client authenticates with bearer tokens from DefaultAzureCredential.

    Args:
        transcript_md (str): Markdown text of the RCA transcript
        formatting_template (str): HTML/CSS template the output should follow

    Returns:
        str: The message content of the first completion choice
    """
    # Fixed instructions, sent with the "developer" role
    developer_instructions = "You are tasked with creating a final Root Cause Analysis (RCA) document in HTML format by transforming the provided markdown content (`{{transcript_md}}`) and applying the given formatting template (`{{formatting_template}}`).\n\n# Steps\n\n1. Parse the provided markdown content `{{transcript_md}}` into structured HTML.\n2. Use the `{{formatting_template}}` to style and format the HTML appropriately. Ensure the final document is consistent with the provided template's style.\n3. Ensure proper HTML tags, formatting, and structure (e.g., headings, paragraphs, lists, tables, etc.) to maintain clarity and readability.\n4. Only extract and include relevant information in the RCA document. Redundant or irrelevant content should be excluded.\n5. Validate the final document as a well-formed HTML structure.\n\n# Output Format\n\n- Provide the completed RCA document as a valid HTML string.\n- Ensure that the HTML output includes all applied styles and formatting from `{{formatting_template}}`.\n- The output should NOT include code blocks (```) around the HTML.\n\n# Notes\n\n- Ensure the readability of the produced HTML and maintain compatibility with diverse browsers.\n- Any headings or sections missing in `{{transcript_md}}` but integral to an RCA (e.g., summary, impact, corrective action) should be inferred logically and clearly defined.\ninclude css information as part of the html\n"

    # Per-call payload carrying the actual transcript and template text
    user_request = f"Transcript content:\n{transcript_md}\n\nFormatting template:\n{formatting_template}\n\nCreate the final RCA HTML document."

    azure_endpoint = os.getenv("ENDPOINT_URL", "https://aifoundryarc.openai.azure.com/")
    model_deployment = os.getenv("DEPLOYMENT_NAME", "gpt-5-mini")

    # Entra ID authentication: the client requests tokens through this provider
    credential = DefaultAzureCredential()
    bearer_tokens = get_bearer_token_provider(
        credential,
        "https://cognitiveservices.azure.com/.default",
    )
    openai_client = AzureOpenAI(
        azure_endpoint=azure_endpoint,
        azure_ad_token_provider=bearer_tokens,
        api_version="2025-01-01-preview",
    )

    messages = [
        {
            "role": "developer",
            "content": [{"type": "text", "text": developer_instructions}],
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": user_request}],
        },
    ]

    # Single, non-streaming completion request
    response = openai_client.chat.completions.create(
        model=model_deployment,
        messages=messages,
        max_completion_tokens=16384,
        stop=None,
        stream=False,
    )

    first_choice = response.choices[0]
    return first_choice.message.content

In [None]:
# Fix for the extract_formatting_template function - add missing client initialization
def extract_formatting_template_fixed(html_content: str) -> str:
    """
    Ask the Azure OpenAI deployment to derive a reusable formatting template
    (HTML structure plus CSS) from an existing HTML document.

    The endpoint and deployment name are read from the ENDPOINT_URL and
    DEPLOYMENT_NAME environment variables (each with a built-in fallback), and
    the client authenticates with bearer tokens from DefaultAzureCredential.

    Args:
        html_content (str): The HTML document whose formatting should be extracted

    Returns:
        str: The message content of the first completion choice
    """
    # Fixed instructions, sent with the "developer" role
    developer_instructions = "You are tasked with analyzing an HTML document and extracting its formatting template. The template should capture the document's structure, CSS styles, and layout patterns while removing specific content.\n\n# Steps\n\n1. Parse the provided HTML content to identify its structure (headings, sections, tables, lists, etc.).\n2. Extract all CSS styling information (inline styles, <style> blocks, class definitions).\n3. Create a formatting template that preserves the HTML structure and all styling while replacing actual content with placeholder text or empty structures.\n4. Ensure the template maintains all CSS classes, table structures, heading hierarchies, and layout patterns.\n5. Document key formatting elements (fonts, colors, spacing, borders) in the template.\n\n# Output Format\n\n- Provide the formatting template as valid HTML with embedded CSS.\n- Replace specific content with descriptive placeholders (e.g., {{section_title}}, {{table_data}}).\n- Include comments to explain major sections and styling choices.\n- The output should NOT include code blocks (```) around the HTML.\n\n# Notes\n\n- Preserve all structural elements even if they appear empty.\n- Maintain the exact CSS styling and class names.\n- Ensure the template can be reused with different content while maintaining the same visual appearance.\n"

    # Per-call payload carrying the HTML to analyze
    user_request = f"HTML content to analyze:\n{html_content}\n\nExtract the formatting template from this HTML document."

    azure_endpoint = os.getenv("ENDPOINT_URL", "https://aifoundryarc.openai.azure.com/")
    model_deployment = os.getenv("DEPLOYMENT_NAME", "gpt-5-mini")

    # Entra ID authentication: the client requests tokens through this provider
    credential = DefaultAzureCredential()
    bearer_tokens = get_bearer_token_provider(
        credential,
        "https://cognitiveservices.azure.com/.default",
    )
    openai_client = AzureOpenAI(
        azure_endpoint=azure_endpoint,
        azure_ad_token_provider=bearer_tokens,
        api_version="2025-01-01-preview",
    )

    messages = [
        {
            "role": "developer",
            "content": [{"type": "text", "text": developer_instructions}],
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": user_request}],
        },
    ]

    # Single, non-streaming completion request
    response = openai_client.chat.completions.create(
        model=model_deployment,
        messages=messages,
        max_completion_tokens=16384,
        stop=None,
        stream=False,
    )

    first_choice = response.choices[0]
    return first_choice.message.content

# Install required packages for the workflow
def install_required_packages():
    """
    Pip-install each dependency of the RCA workflow using the running interpreter.

    Packages are installed one at a time; a failed install is reported on
    stdout and the loop continues with the next package.
    """
    import subprocess
    import sys

    # Invoke pip through the current interpreter so packages land in its environment
    pip_install = [sys.executable, "-m", "pip", "install"]

    print("📦 Installing required packages...")
    for package in ("python-docx", "beautifulsoup4", "lxml", "openai", "azure-identity"):
        try:
            subprocess.check_call(pip_install + [package])
        except subprocess.CalledProcessError:
            print(f"❌ Failed to install {package}")
        else:
            print(f"✅ {package} installed successfully")

# Announce that this cell's helpers are available and how to use them
print(
    "🔧 Additional utilities loaded!",
    "💡 Run install_required_packages() if you need to install dependencies",
    "🛠️ Use extract_formatting_template_fixed() instead of extract_formatting_template()",
    sep="\n",
)

In [None]:
# HTML to DOCX Converter Integration
import sys
import os
from datetime import datetime

# Add path to import our HTML to DOCX converter
# NOTE(review): this appends "<parent of cwd>/.." (un-normalized, i.e. two
# levels above the working directory) to sys.path — confirm that is where the
# converter module is expected to live.
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), '..'))

# Import the DOCX/HTML dependencies. On ImportError only an install hint is
# printed and execution continues, so names from the failed import onward stay
# undefined and convert_html_to_docx() cannot run until they are installed.
try:
    from docx import Document
    from docx.shared import Pt
    from bs4 import BeautifulSoup
    print("✅ HTML to DOCX converter dependencies loaded successfully!")
except ImportError as e:
    print(f"❌ Missing package for DOCX conversion: {e}")
    print("Install with: pip install python-docx beautifulsoup4 lxml")

def read_template_file(template_path: str) -> str:
    """
    Load a formatting template from disk as UTF-8 text.

    Failures are reported on stdout rather than raised.

    Args:
        template_path (str): Location of the template file

    Returns:
        str: The file's text, or an empty string if it could not be read
    """
    try:
        with open(template_path, encoding='utf-8') as template_file:
            loaded_text = template_file.read()
        print(f"✅ Template loaded from: {template_path}")
    except FileNotFoundError:
        print(f"❌ Template file not found: {template_path}")
        return ""
    except Exception as e:
        # Any other failure (permissions, decoding, ...) is also reported, not raised
        print(f"❌ Error reading template: {e}")
        return ""
    return loaded_text

def read_markdown_file(markdown_path: str) -> str:
    """
    Load a markdown document from disk as UTF-8 text.

    Failures are reported on stdout rather than raised.

    Args:
        markdown_path (str): Location of the markdown file

    Returns:
        str: The file's text, or an empty string if it could not be read
    """
    try:
        with open(markdown_path, encoding='utf-8') as markdown_file:
            loaded_text = markdown_file.read()
        print(f"✅ Markdown loaded from: {markdown_path}")
    except FileNotFoundError:
        print(f"❌ Markdown file not found: {markdown_path}")
        return ""
    except Exception as e:
        # Any other failure (permissions, decoding, ...) is also reported, not raised
        print(f"❌ Error reading markdown: {e}")
        return ""
    return loaded_text

_HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')


def _add_html_paragraph(doc, element):
    """Append a <p> element as one DOCX paragraph, keeping bold/italic/underline runs."""
    if not element.get_text().strip():
        return

    paragraph = doc.add_paragraph()
    for child in element.children:
        if getattr(child, 'name', None):
            # Inline tag: one run per direct child, styled for the basic emphasis tags
            run = paragraph.add_run(child.get_text())
            if child.name in ('strong', 'b'):
                run.bold = True
            elif child.name in ('em', 'i'):
                run.italic = True
            elif child.name == 'u':
                run.underline = True
        else:
            text = str(child)
            if text.strip():
                paragraph.add_run(text)
            elif text and paragraph.runs:
                # Whitespace between two inline tags separates words; keep one space
                paragraph.add_run(' ')


def _add_html_list(doc, list_element):
    """Append a <ul>/<ol> as list paragraphs, emitting each (nested) item exactly once."""
    style = 'List Number' if list_element.name == 'ol' else 'List Bullet'
    for entry in list_element.find_all(['li', 'ul', 'ol'], recursive=False):
        if entry.name != 'li':
            # A list placed directly inside a list (no <li> wrapper)
            _add_html_list(doc, entry)
            continue

        # Text of this item only; nested lists are rendered as their own items below
        own_text = ''.join(
            child.get_text() if getattr(child, 'name', None) else str(child)
            for child in entry.children
            if getattr(child, 'name', None) not in ('ul', 'ol')
        ).strip()
        if own_text:
            doc.add_paragraph(own_text, style=style)

        for nested in entry.find_all(['ul', 'ol'], recursive=False):
            _add_html_list(doc, nested)


def _add_html_table(doc, table_element):
    """Append a <table> as a DOCX grid table holding the plain text of every cell."""
    rows = table_element.find_all('tr')
    if not rows:
        return

    cells_by_row = [row.find_all(['td', 'th']) for row in rows]
    column_count = max(len(cells) for cells in cells_by_row)
    if column_count == 0:
        # Rows without any cells: nothing to render, and avoid a zero-column table
        return

    doc_table = doc.add_table(rows=len(rows), cols=column_count)
    doc_table.style = 'Table Grid'
    for row_idx, cells in enumerate(cells_by_row):
        for col_idx, cell in enumerate(cells):
            doc_table.cell(row_idx, col_idx).text = cell.get_text().strip()


def _add_html_blocks(doc, parent):
    """Walk parent's children in document order and append each supported block to doc."""
    for child in parent.children:
        name = getattr(child, 'name', None)
        if not name:
            # Bare text outside a supported block element is not rendered
            continue
        if name in _HEADING_TAGS:
            doc.add_heading(child.get_text().strip(), int(name[1]))
        elif name == 'p':
            _add_html_paragraph(doc, child)
        elif name in ('ul', 'ol'):
            _add_html_list(doc, child)
        elif name == 'table':
            _add_html_table(doc, child)
        elif name not in ('head', 'title'):
            # Containers (div, section, article, ...): descend to reach their blocks
            _add_html_blocks(doc, child)


def convert_html_to_docx(html_content: str, output_filename: str = None) -> str:
    """
    Convert HTML content to DOCX format.

    Headings, paragraphs, lists and tables are written to the document in the
    order they appear in the HTML. Lists and tables are rendered as units, so
    nested list items and text inside list items or table cells appear once.
    <script> and <style> elements are removed before conversion, and a <title>
    element, if present, becomes the document title heading. The directory of
    the output file is created if it does not exist.

    Args:
        html_content (str): HTML content as string
        output_filename (str): Optional output filename; when omitted, a
            timestamped "rca_document_<YYYYmmdd_HHMMSS>.docx" name is used

    Returns:
        str: Path to created DOCX file
    """

    print("🔄 Converting HTML to DOCX...")

    # Create new document
    doc = Document()

    # Parse HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove script and style tags so their source text never reaches the document
    for unwanted in soup(["script", "style"]):
        unwanted.decompose()

    # Add title if present
    title = soup.find('title')
    if title is not None and title.string:
        doc.add_heading(title.string.strip(), 0)

    # Fall back to the whole parse tree for HTML fragments without a <body>
    body = soup.find('body')
    if body is None:
        body = soup

    _add_html_blocks(doc, body)

    # Generate filename if not provided
    if not output_filename:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_filename = f"rca_document_{timestamp}.docx"

    # Saving fails if the target directory is missing, so create it first
    output_dir = os.path.dirname(output_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save document
    doc.save(output_filename)
    print(f"✅ DOCX document saved: {output_filename}")
    return output_filename

In [None]:
def process_rca_workflow(markdown_file_path: str, template_file_path: str, output_docx_path: str = None) -> str:
    """
    Complete workflow: Read template and markdown, extract the template's
    formatting, generate HTML, convert to DOCX.

    Args:
        markdown_file_path (str): Path to the markdown transcript file
        template_file_path (str): Path to the HTML formatting template file
        output_docx_path (str): Optional path for the output DOCX file

    Returns:
        str: Path to the generated DOCX file

    Raises:
        ValueError: If the template or transcript cannot be read, or if the
            formatting extraction or HTML generation returns no content
    """

    print("🚀 Starting RCA Document Generation Workflow")
    print("=" * 60)

    # Step 1: Read the formatting template
    print("📄 Step 1: Reading formatting template...")
    formatting_template = read_template_file(template_file_path)
    if not formatting_template:
        raise ValueError("Failed to read formatting template")

    # Step 2: Read the markdown transcript
    print("📄 Step 2: Reading markdown transcript...")
    transcript_md = read_markdown_file(markdown_file_path)
    if not transcript_md:
        raise ValueError("Failed to read markdown transcript")

    # Reduce the template document to its structure and styling before generation
    print("Extracting format from template html")
    extracted_template = extract_formatting_template_fixed(formatting_template)
    # Validate like every other step: an empty/None result would otherwise be
    # interpolated into the generation prompt as the formatting template
    if not extracted_template:
        raise ValueError("Failed to extract formatting template")

    # Step 3: Generate HTML using Azure OpenAI
    print("🤖 Step 3: Generating HTML with Azure OpenAI...")
    html_content = generate_rca_html(transcript_md, extracted_template)

    if not html_content:
        raise ValueError("Failed to generate HTML content")

    print(f"✅ Generated HTML content ({len(html_content)} characters)")

    # Step 4: Convert HTML to DOCX
    print("📝 Step 4: Converting HTML to DOCX...")
    docx_path = convert_html_to_docx(html_content, output_docx_path)

    print("🎉 Workflow completed successfully!")
    print(f"📄 Final DOCX document: {docx_path}")

    return docx_path

def save_html_debug(html_content: str, filename: str = None) -> str:
    """
    Write HTML content to a UTF-8 file so it can be inspected while debugging.

    Failures are reported on stdout rather than raised.

    Args:
        html_content (str): HTML content to save
        filename (str): Optional target filename; when omitted, a timestamped
            "debug_rca_<YYYYmmdd_HHMMSS>.html" name is used

    Returns:
        str: Path to the saved HTML file, or an empty string if saving failed
    """
    if not filename:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"debug_rca_{timestamp}.html"

    try:
        with open(filename, 'w', encoding='utf-8') as debug_file:
            debug_file.write(html_content)
        print(f"🐛 Debug HTML saved: {filename}")
    except Exception as e:
        print(f"❌ Error saving HTML debug file: {e}")
        return ""
    return filename



In [None]:




# Run the end-to-end workflow on the sample transcript and template
process_rca_workflow(
    markdown_file_path="./transcripts/transcript.md",
    template_file_path="./template_files/template.html",
    output_docx_path="./output/final_rca_document.docx",
)