In [34]:
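# One-time setup: uncomment the lines below to install the Python dependencies.
#!pip install pdoc weasyprint
#!pip install markdown
#!pip install pdfkit
# pdfkit also needs the wkhtmltopdf binary, downloadable from https://wkhtmltopdf.org/downloads.html
#!pip install pymupdf  # provides the `fitz` module imported below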

In [1]:
import ast
import os
import glob
import re
import pdfkit
import fitz  

# Define folder and output file paths
FOLDER_DOC = "docs"
os.makedirs(FOLDER_DOC, exist_ok=True)  # the combined HTML is written into this folder
combined_html_path = f'{FOLDER_DOC}/AIzymes_Manual.html'
pdf_path = '../AIzymes_Manual.pdf'

# Configure pdfkit to use wkhtmltopdf.
# The location of the binary is machine-specific, so it can be overridden with the
# WKHTMLTOPDF_PATH environment variable; the default Windows install path is used otherwise.
WKHTMLTOPDF_PATH = os.environ.get(
    "WKHTMLTOPDF_PATH",
    r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe",
)
config = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_PATH)

# Function to format specific headers like Args and Returns using HTML tables
def format_text_with_specific_headers(text, keywords):
    """Convert docstring text to HTML, rendering keyword sections as tables.

    Every line is stripped and then classified:

    * A line starting with ``<keyword>:`` opens a section: a gray bold heading
      followed by a two-column table. "Modules Required" gets a borderless
      header row; every other keyword gets an underlined header row. Only the
      text before the first colon is used; anything after it on the header
      line is discarded.
    * Inside a section, a line containing ``": "`` becomes a table row, split
      at the first ``": "`` into name and description.
    * A blank line closes the open table (if any) and emits ``<br>``.
    * Any other line is passed through unchanged.

    A section header that appears while another section is still open closes
    the previous table first, so the generated ``<table>`` tags always balance.

    Args:
        text: The docstring text to format.
        keywords: Section names (without the trailing colon) that start a table.

    Returns:
        The formatted HTML fragments joined with newlines.
    """
    table_close = "</table><br>"
    plain_table_open = (
        "<table style='width:100%; border-collapse: collapse;'><tr>"
        "<th style='text-align:left; width:30%;'> </th>"
        "<th style='text-align:left;'> </th></tr>"
    )
    bordered_table_open = (
        "<table style='width:100%; border-collapse: collapse;'><tr>"
        "<th style='text-align:left; width:30%; border-bottom: 1px solid #999;'> </th>"
        "<th style='text-align:left; border-bottom: 1px solid #999;'> </th></tr>"
    )

    formatted_lines = []
    inside_section = False

    for line in text.splitlines():
        line = line.strip()

        if any(line.startswith(keyword + ":") for keyword in keywords):
            # Back-to-back sections (no blank line in between) would otherwise
            # leave the previous table unclosed.
            if inside_section:
                formatted_lines.append(table_close)

            section_heading = line.split(":")[0]
            formatted_lines.append(f"<br><span style='color:gray; font-weight:bold;'>{section_heading}</span><br>")
            if section_heading == "Modules Required":
                formatted_lines.append(plain_table_open)
            else:
                # Use a bordered table for other sections
                formatted_lines.append(bordered_table_open)
            inside_section = True

        elif inside_section and ": " in line:
            param, desc = line.split(": ", 1)
            # The trailing padding is kept so the emitted markup stays unchanged.
            formatted_param = f"{param.strip()}{' ' * 25}"
            formatted_lines.append(f"<tr><td style='padding-right: 10px; font-size: 12px;'>{formatted_param}</td><td style='font-size: 12px;'>{desc.strip()}</td></tr>")

        elif not line:
            if inside_section:
                formatted_lines.append(table_close)  # Close table at the end of section
                inside_section = False
            formatted_lines.append("<br>")

        else:
            formatted_lines.append(line)

    # The text may end while a section is still open.
    if inside_section:
        formatted_lines.append(table_close)

    return "\n".join(formatted_lines)


# Function to extract docstrings and generate ToC entries with numbered headings
def extract_docstrings(filepath, file_number):
    """Build the ToC entries and the HTML body for one Python source file.

    The file is parsed with ``ast``. The module docstring (if any) and the
    docstring of every documented function, async function and class are
    formatted with ``format_text_with_specific_headers`` and placed under
    numbered headings: ``1.<file_number>`` for the file and
    ``1.<file_number>.<n>`` for each documented definition.

    Args:
        filepath: Path of the Python file to document.
        file_number: Number of this file within chapter 1 of the manual.

    Returns:
        A ``(toc_html, body_html)`` tuple. Both are newline-joined strings;
        every ToC entry carries a literal ``[page]`` placeholder.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        SyntaxError: If the file is not valid Python.
    """
    section_keywords = ["Args", "Returns", "Functions", "Classes", "Modules Required", "Usage"]

    filename = os.path.basename(filepath)
    # Drop the trailing "_<digits>.py" version suffix to get the display name.
    formatted_filename = re.sub(r'_\d+\.py$', '', filename)

    docstrings = [f"<h2 id='{formatted_filename}'>1.{file_number} {formatted_filename}</h2><br>"]
    toc_entries = [f"<li><a href='#{formatted_filename}'>1.{file_number} {formatted_filename}</a><span style='float:right;'>[page]</span></li>"]

    # Python source defaults to UTF-8; relying on the platform default encoding
    # breaks on Windows. "utf-8-sig" additionally strips a leading BOM.
    with open(filepath, "r", encoding="utf-8-sig") as file:
        tree = ast.parse(file.read())

    # Process the module-level docstring if it exists
    module_docstring = ast.get_docstring(tree)
    if module_docstring:
        formatted_docstring = format_text_with_specific_headers(module_docstring, section_keywords)
        docstrings.append(f"<p>{formatted_docstring}</p><br>")

    section_number = 1

    # Traverse the AST for functions and classes to apply consistent formatting.
    # NOTE(review): the ast documentation describes ast.walk's order as
    # unspecified, so subsection numbering is not guaranteed to follow source
    # order -- confirm this is acceptable.
    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            docstring = ast.get_docstring(node)
            if docstring:
                # Build the section and ToC entry.
                # The id is derived from the file and definition name only, so
                # two definitions with the same name in one file share an id.
                section_id = f"{formatted_filename}_{node.name}"
                header = f"<h3 id='{section_id}'>1.{file_number}.{section_number} {node.name}</h3><br>"
                toc_entries.append(f"<li style='margin-left: 20px;'><a href='#{section_id}'>1.{file_number}.{section_number} {node.name}</a><span style='float:right;'>[page]</span></li>")
                section_number += 1

                # Format docstring with specified headers consistently at all levels
                formatted_docstring = format_text_with_specific_headers(docstring, section_keywords)
                docstrings.append(header + formatted_docstring + "<br>")

    return "\n".join(toc_entries), "\n".join(docstrings)

# Collect docstrings and ToC from all matching files
combined_docstring_content = []
combined_toc_entries = ["<h1>Table of Contents</h1><ul>", "<li><a href='#code'>1. Code</a></li>"]
combined_docstring_content.append("<h1 id='code'>1. Code</h1><br>")

# glob.glob returns matches in arbitrary order, so the list is sorted to keep
# the section numbers (1.1, 1.2, ...) identical from run to run.
file_number = 1
for filepath in sorted(glob.glob("*.py")):
    # Only files whose name ends in "_<three digits>.py" are documented.
    if re.search(r"_\d{3}\.py$", filepath):
        toc_entries, docstring_content = extract_docstrings(filepath, file_number)
        combined_toc_entries.append(toc_entries)
        combined_docstring_content.append(docstring_content)
        file_number += 1

combined_toc_entries.append("</ul>")
toc_html = "\n".join(combined_toc_entries)
doc_html = "\n".join(combined_docstring_content)

# Combine ToC and docstring content
html_content = toc_html + "<br>" + doc_html

# CSS for styling and page numbering
# NOTE(review): the second @page rule uses a CSS margin box (@bottom-right);
# confirm that wkhtmltopdf honors it -- the final PDF pass adds its page-number
# footer through the 'footer-center' option instead.
css = """
<style>
    body { font-family: Calibri, sans-serif; font-size: 12px; margin: 20px; }
    h1, h2, h3, ul, li { color: #333; }
    h1 { color: #4CAF50; font-size: 1.5em; margin-bottom: 0; }
    h2 { color: #4CAF50; font-size: 1.2em; margin-bottom: 0; }
    h3 { font-size: 1.0em; margin-bottom: 0; }
    ul { padding-left: 20px; list-style-type: none; }
    li { margin-bottom: 5px; font-size: 12px; }
    @page { size: A4; margin: 1in; }
    @page { @bottom-right { content: "Page " counter(page); } }
</style>
"""

# Prepend the stylesheet to the ToC + body markup built earlier.
html_content = css + html_content

# Write the draft HTML; its ToC entries still contain the literal "[page]" placeholders.
with open(combined_html_path, "w") as f:
    f.write(html_content)

# Generate a draft PDF from the HTML file. It is only used to look up the page
# on which each section lands; it is regenerated at the same path later.
# Options with an empty-string value are presumably passed to wkhtmltopdf as
# value-less flags -- see the pdfkit documentation.
pdfkit.from_file(combined_html_path, pdf_path, configuration=config, options={
    'quiet': '',
    'dpi': 300,
    'disable-smart-shrinking': '',
    'enable-local-file-access': '',
})

# Analyze the draft PDF to find page numbers for each section
toc_page_numbers = {}

# Collect every numbered ToC label: file level ("1.2 name") as well as
# subsection level ("1.2.3 name"), so that all "[page]" placeholders get filled.
toc_identifiers = [
    identifier
    for entry in combined_toc_entries
    for identifier in re.findall(r">(1\.\d+(?:\.\d+)? \w+)<", entry)
]

# Match whole labels only: a plain substring test would find "1.1 foo" inside
# "1.1.1 foo" and "1.1 foo" inside "1.1 foobar".
identifier_patterns = {
    identifier: re.compile(rf"(?<![\w.]){re.escape(identifier)}(?!\w)")
    for identifier in toc_identifiers
}

doc = fitz.open(pdf_path)
try:
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text = page.get_text("text")

        for identifier, pattern in identifier_patterns.items():
            if pattern.search(text):
                # Later pages overwrite earlier ones, so the ToC page itself
                # (which lists every label) is superseded by the heading's page.
                toc_page_numbers[identifier] = page_num + 1
finally:
    # The draft PDF is overwritten below; release the file handle first so the
    # rewrite cannot fail on an open/locked file.
    doc.close()

# Update HTML with actual page numbers
for identifier, page_num in toc_page_numbers.items():
    # Anchor on ">label</a>" so only the ToC link for exactly this label is
    # touched; the lazy ".*?" stays on the entry's own line.
    html_content = re.sub(
        rf"(>{re.escape(identifier)}</a>.*?)\[page\]",
        rf"\g<1>{page_num}",
        html_content,
    )

# Save the final HTML with page numbers
with open(combined_html_path, "w") as f:
    f.write(html_content)

# Generate the final PDF with updated page numbers
# NOTE(review): the footer is only added in this pass; if it changes the
# pagination relative to the draft, the ToC numbers will be off -- confirm.
pdfkit.from_file(combined_html_path, pdf_path, configuration=config, options={
    'quiet': '',
    'dpi': 300,
    'disable-smart-shrinking': '',
    'enable-local-file-access': '',
    'footer-center': '[page]',
    'footer-font-name': 'Calibri',
    'footer-font-size': '12',
})

print(f"Final PDF generated at {pdf_path}")

Final PDF generated at ../AIzymes_Manual.pdf
