In [435]:
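# One-time environment setup: uncomment the lines below to install the packages.
#!pip install pdoc weasyprint
#!pip install markdown
#!pip install pdfkit
# pdfkit drives the external wkhtmltopdf executable, which is installed separately:
# https://wkhtmltopdf.org/downloads.html
# pymupdf provides the `fitz` module imported below.
#!pip install pymupdf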

In [436]:
# General setup
import ast
import os
import glob
import re
import pdfkit
import fitz  

# Define folder and output file paths
FOLDER_DOC = "docs"
os.makedirs(FOLDER_DOC, exist_ok=True)  # the combined HTML is written into this folder
combined_html_path = f'{FOLDER_DOC}/AIzymes_Manual.html'
pdf_path = '../AIzymes_Manual.pdf'

# Configure pdfkit to use wkhtmltopdf.
# The default Windows install location is used when it exists. On any other
# machine pdfkit.configuration() is called without a path so that pdfkit looks
# the executable up itself instead of failing on a path that is not there.
WKHTMLTOPDF_EXE = r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"
if os.path.isfile(WKHTMLTOPDF_EXE):
    config = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_EXE)
else:
    config = pdfkit.configuration()

In [437]:
def create_code_html(doc_html, toc_html, section_number, combined_toc_entries):
    """Build the "Code" section of the manual from the docstrings of local modules.

    Every ``*.py`` file in the current working directory whose name ends in
    ``_<three digits>.py`` is parsed with ``ast``. Its module docstring and the
    docstrings of its functions and classes are converted to HTML, and one
    table-of-contents entry per file (with nested entries per documented
    function/class) is generated.

    Args:
        doc_html: Document HTML accumulated so far; this section is appended to it.
        toc_html: Table-of-contents HTML accumulated so far; the entries of this
            section are appended to it (entries of earlier sections are kept).
        section_number: Number printed in front of the section title and used as
            prefix of the file and function/class numbers.
        combined_toc_entries: List that receives the TOC entries generated here.
            It is extended in place and also returned.

    Returns:
        Tuple ``(doc_html, toc_html, combined_toc_entries)``.
    """
    # Docstring headings that start a two-column table.
    header_keywords = ["Args", "Returns", "Functions", "Classes", "Modules Required", "Usage"]

    def format_text_with_specific_headers(text, keywords):
        """Turn docstring text into HTML; lines below a keyword heading become table rows."""
        # "Modules Required" gets a table without the ruled header row.
        plain_table = "<table style='width:100%; border-collapse: collapse;'><tr><th style='text-align:left; width:30%;'> </th><th style='text-align:left;'> </th></tr>"
        ruled_table = "<table style='width:100%; border-collapse: collapse;'><tr><th style='text-align:left; width:30%; border-bottom: 1px solid #999;'> </th><th style='text-align:left; border-bottom: 1px solid #999;'> </th></tr>"

        formatted_lines = []
        inside_section = False

        for line in text.splitlines():
            line = line.strip()
            if any(line.startswith(keyword + ":") for keyword in keywords):
                # Only the heading itself is kept; text after the colon is not emitted.
                section_heading = line.split(":")[0]
                formatted_lines.append(f"<span style='color:gray; font-weight:bold;'>{section_heading}</span>")
                formatted_lines.append(plain_table if section_heading == "Modules Required" else ruled_table)
                inside_section = True
            elif inside_section and ": " in line:
                # "name: description" becomes one table row.
                param, desc = line.split(": ", 1)
                formatted_param = f"{param.strip()}{' ' * 25}"
                formatted_lines.append(f"<tr><td style='padding-right: 10px; font-size: 12px;'>{formatted_param}</td><td style='font-size: 12px;'>{desc.strip()}</td></tr>")
            elif not line:
                # A blank line ends the current table.
                if inside_section:
                    formatted_lines.append("</table><br>")
                    inside_section = False
                formatted_lines.append("<br>")
            else:
                formatted_lines.append(line)

        # Close a table that runs to the end of the docstring.
        if inside_section:
            formatted_lines.append("</table><br>")

        return "\n".join(formatted_lines)

    def extract_docstrings(filepath, file_number):
        """Return ``(toc_html, doc_html)`` for the documented objects of one file."""
        filename = os.path.basename(filepath)
        # Drop the trailing "_<digits>.py" so only the module's base name is shown.
        formatted_filename = re.sub(r'_\d+\.py$', '', filename)

        docstrings = [f"<h2 id='{formatted_filename}'>{section_number}.{file_number} {formatted_filename}</h2>"]
        toc_entries = [f"<li><a href='#{formatted_filename}'>{section_number}.{file_number} {formatted_filename}</a><span style='float:right;'>[page]</span>"]

        with open(filepath, "r") as file:
            tree = ast.parse(file.read())

        module_docstring = ast.get_docstring(tree)
        if module_docstring:
            formatted_docstring = format_text_with_specific_headers(module_docstring, header_keywords)
            docstrings.append(f"<p>{formatted_docstring}</p><br>")

        subsection_number = 1
        sub_toc_entries = []
        for node in ast.walk(tree):
            # Async functions are documented exactly like regular ones.
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
                docstring = ast.get_docstring(node)
                if docstring:
                    section_id = f"{formatted_filename}_{node.name}"
                    header = f"<h3 id='{section_id}'>{section_number}.{file_number}.{subsection_number} {node.name}</h3>"
                    sub_toc_entries.append(f"<li><a href='#{section_id}'>{section_number}.{file_number}.{subsection_number} {node.name}</a><span style='float:right;'>[page]</span></li>")
                    subsection_number += 1
                    formatted_docstring = format_text_with_specific_headers(docstring, header_keywords)
                    docstrings.append(header + formatted_docstring + "<br>")

        # Only add sub-entries if they exist
        if sub_toc_entries:
            toc_entries.append("<ul>" + "\n".join(sub_toc_entries) + "</ul>")
        toc_entries.append("</li>")

        return "\n".join(toc_entries), "\n".join(docstrings)

    # Entries created by this call only; kept separate so that they can be
    # appended to the incoming TOC without repeating or dropping older entries.
    new_toc_entries = [f"<li><a href='#code'>{section_number}. Code</a><span style='float:right;'>[page]</span><ul>"]
    combined_docstring_content = [f"<h1 id='code'>{section_number}. Code</h1><br>"]

    # glob returns files in arbitrary order; sort so file numbers are reproducible.
    module_paths = sorted(path for path in glob.glob("*.py") if re.search(r"_\d{3}\.py$", path))
    for file_number, filepath in enumerate(module_paths, start=1):
        toc_entries, docstring_content = extract_docstrings(filepath, file_number)
        new_toc_entries.append(toc_entries)
        combined_docstring_content.append(docstring_content)

    new_toc_entries.append("</ul></li>")

    combined_toc_entries.extend(new_toc_entries)
    # Append (do not overwrite) so TOC entries of earlier sections survive.
    toc_html += "\n".join(new_toc_entries)
    doc_html += "\n".join(combined_docstring_content)

    return doc_html, toc_html, combined_toc_entries


In [438]:
def create_section_html(section, doc_html, toc_html, section_number, combined_toc_entries):
    """Convert ``docs/<section>.md`` into numbered HTML plus table-of-contents entries.

    The file is processed line by line:
      * ``<!-- Figure: ID -->`` becomes an ``<a id='ID'>`` anchor,
      * ``![alt](path)`` becomes a centered ``<img>``,
      * lines starting with ``<b>Fig. `` are wrapped in a ``figure-caption`` paragraph,
      * ``## `` / ``### `` headers are numbered and added to the TOC,
      * every other line is passed through unchanged.

    Args:
        section: Section title; also the base name of the Markdown file in ``docs``.
        doc_html: Document HTML accumulated so far; this section is appended to it.
        toc_html: Table-of-contents HTML accumulated so far; entries are appended to it.
        section_number: Number printed in front of the section title and its headers.
        combined_toc_entries: Returned unchanged; not modified by this function.

    Returns:
        Tuple ``(doc_html, toc_html, combined_toc_entries)``.
    """
    figure_prefix = "<!-- Figure: "
    page_placeholder = "<span style='float:right;'>[page]</span>"

    def to_anchor_id(title):
        """Lower-case the title and replace spaces with underscores."""
        return title.lower().replace(" ", "_")

    filepath = f"docs/{section}.md"

    with open(filepath, "r") as file:
        content = file.read()

    # Replace multiple newlines with a single newline for clean HTML formatting
    content = re.sub(r'\n\s*\n', '\n', content)

    section_id = to_anchor_id(section)
    processed_lines = [f"<h1 id='{section_id}'>{section_number}. {section}</h1>"]
    toc_html += f"<li><a href='#{section_id}'>{section_number}. {section}</a>{page_placeholder}</li>"

    subheader_count = 1
    subsubheader_count = 1

    for line in content.splitlines():
        # Detect figure label and add anchor
        if line.startswith(figure_prefix):
            # Take everything between the prefix and the closing "-->", so ids
            # that contain ':' (e.g. "fig:flow") or end in '-' stay intact.
            figure_id = line[len(figure_prefix):].partition("-->")[0].strip()
            processed_lines.append(f"<a id='{figure_id}'></a>")

        # Detect image Markdown and convert to centered HTML <img>
        elif line.startswith("!["):
            match = re.match(r"!\[(.*?)\]\((.*?)\)", line)
            if match:
                alt_text, img_path = match.groups()
                processed_lines.append(f"<img src='{img_path}' alt='{alt_text}' style='display: block; margin: 0 auto;'>")
            else:
                # Not a complete image link: keep the text instead of dropping it.
                processed_lines.append(line)

        # Detect and format the caption line
        elif line.startswith("<b>Fig. "):
            processed_lines.append(f"<p class='figure-caption'>{line}</p>")

        # Process level 2 headers (subheaders)
        elif line.startswith("## "):
            subheader = line[3:].strip()
            subheader_id = to_anchor_id(subheader)
            label = f"{section_number}.{subheader_count} {subheader}"
            processed_lines.append(f"<h2 id='{subheader_id}'>{label}</h2>")
            toc_html += f"<ul><li><a href='#{subheader_id}'>{label}</a>{page_placeholder}</li></ul>"
            subsubheader_count = 1  # Reset sub-subheader count for each new subheader
            subheader_count += 1

        # Process level 3 headers (sub-subheaders)
        elif line.startswith("### "):
            subsubheader = line[4:].strip()
            subsubheader_id = to_anchor_id(subsubheader)
            # subheader_count already points at the next "## " header, hence the -1.
            label = f"{section_number}.{subheader_count - 1}.{subsubheader_count} {subsubheader}"
            processed_lines.append(f"<h3 id='{subsubheader_id}'>{label}</h3>")
            toc_html += f"<ul><ul><li><a href='#{subsubheader_id}'>{label}</a>{page_placeholder}</li></ul></ul>"
            subsubheader_count += 1

        else:
            processed_lines.append(line)

    # Combine processed lines into doc_html to keep order correct
    doc_html += "<br>".join(processed_lines)

    return doc_html, toc_html, combined_toc_entries


In [439]:
def finalize_toc(toc_html):
    """Wrap the collected TOC entries in a heading and an unordered list."""
    heading = "<h1>Table of Contents</h1>"
    return "{}<ul>{}</ul>".format(heading, toc_html)

In [440]:
# Full section list of the manual; it is immediately replaced by the reduced list below.
sections = ["Cover", "Authors", "Installation", "Introduction", "Code", "DeveloperNotes"]
sections = ["Introduction", "Code"]

# Accumulators that the section builders extend one section at a time
cover_html, toc_html, doc_html = "", "", ""
combined_toc_entries = []

for section_number, section in enumerate(sections):

    print(section_number, section)

    if section == "Cover":
        # NOTE(review): create_cover_html is not defined in the cells shown here - confirm it exists.
        cover_html = create_cover_html()
        continue

    # enumerate() is 0-based, the numbers shown in the manual are 1-based
    display_number = section_number + 1
    if section == "Code":
        built = create_code_html(doc_html, toc_html, display_number, combined_toc_entries)
    else:
        built = create_section_html(section, doc_html, toc_html, display_number, combined_toc_entries)
    doc_html, toc_html, combined_toc_entries = built

toc_html = finalize_toc(toc_html)

0 Introduction
1 Code


In [None]:
# Generate HTML
# Stylesheet placed in front of the document
css = """
<style>
    body { font-family: Calibri, sans-serif; font-size: 14px; margin: 0; } 
    h1, h2, h3, ul, li { color: #333; }
    h1 { color: #4CAF50; font-size: 1.5em; margin-top: 0; margin-bottom: 0.0em; } 
    h2 { color: #4CAF50; font-size: 1.2em; margin-top: 0.0em; margin-bottom: 0; } 
    h3 { font-size: 1.0em; margin-top: 0.0em; margin-bottom: 0; } 
    ul { padding-left: 20px; list-style-type: none; }
    li { margin-bottom: 0.0em; font-size: 1.0em; }
    p { margin-top: 0.0em; margin-bottom: 0.0em; } 
    table, th, td  { font-size: 1.0em; }
    .figure-caption { font-size: 0.8em; text-align: justify; margin-top: 0.0em; margin-bottom: 0.0em; }
</style>
"""

# Order in the file: stylesheet, cover, table of contents, document body
html_content = css + "<br>".join([cover_html, toc_html, doc_html])

# Written to disk because the PDF is generated from this file
with open(combined_html_path, "w") as html_file:
    html_file.write(html_content)


In [442]:
# Two-pass PDF build: a draft PDF is rendered first and searched for the
# headings, then the "[page]" placeholders of the TOC are replaced by the
# pages found and the PDF is rendered again.

# wkhtmltopdf options shared by the draft and the final run
pdf_options = {
    'quiet': '',
    'dpi': 300,
    'disable-smart-shrinking': '',
    'enable-local-file-access': '',
    'margin-top': '1.5cm',
    'margin-bottom': '1.5cm',
    'margin-left': '2cm',
    'margin-right': '2cm'
}

# Remove existing PDF to be sure
if os.path.exists(pdf_path):
    os.remove(pdf_path)

# Generate the draft PDF (no footer yet)
pdfkit.from_file(combined_html_path, pdf_path, configuration=config, options=pdf_options)

# Gather TOC identifiers ("<section>.<file> <name>"). The section number is
# matched generically: with a hard-coded "1." nothing is found whenever the
# Code section is not the first section of the manual.
toc_identifier_pattern = re.compile(r">(\d+\.\d+ \w+)<")
toc_identifiers = []
for entry in combined_toc_entries:
    identifier_match = toc_identifier_pattern.search(entry)
    if identifier_match:
        toc_identifiers.append(identifier_match.group(1))

# Analyze the draft PDF to find page numbers for each section
doc = fitz.open(pdf_path)
toc_page_numbers = {}
try:
    # Find page numbers; a later page overwrites an earlier hit, so the last
    # page containing the identifier is the one that is kept.
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text = page.get_text("text")

        for identifier in toc_identifiers:
            if identifier in text:
                toc_page_numbers[identifier] = page_num + 1
finally:
    # Explicitly close the PDF document to release the file, even if reading
    # a page failed; the file is overwritten by the final run below.
    doc.close()

# Update HTML with actual page numbers
for identifier, page_num in toc_page_numbers.items():
    html_content = re.sub(rf"({re.escape(identifier)}.*?)\[page\]", rf"\g<1>{page_num}", html_content)
    html_content = re.sub(rf"({re.escape(identifier)}\.\d+\s*)\[page\]", rf"\g<1>{page_num}", html_content)

# Save the final HTML with page numbers
with open(combined_html_path, "w") as f:
    f.write(html_content)

# Generate the final PDF with updated page numbers and a page-number footer
final_pdf_options = {
    **pdf_options,
    'footer-center': '[page]',
    'footer-font-name': 'Calibri',
    'footer-font-size': '10',
}
pdfkit.from_file(combined_html_path, pdf_path, configuration=config, options=final_pdf_options)

print(f"Final PDF generated at {pdf_path}")

Final PDF generated at ../AIzymes_Manual.pdf
