In [34]:
import mammoth
from bs4 import BeautifulSoup
import os
import base64

In [35]:
# Source Word document that the conversion cell opens in binary mode.
input_filename = "tsa_report.docx"
# NOTE(review): output_filename is not referenced in the cells visible here — confirm it is still needed.
output_filename = "ugly_output.html"
# Destination file for the prettified HTML written at the end of the conversion cell.
pretty_filename = "pretty_output.html"

# Style map handed to mammoth.convert_to_html via style_map=...; it maps every
# table to <table class="usa-table">. The literal keeps its original whitespace.
custom_styles = """ table => table.usa-table
                    """

In [36]:
# Create the "images" folder if it doesn't already exist
os.makedirs("images", exist_ok=True)

# File extension to use for each image MIME type found in a data URI.
# Anything not listed falls back to "png", which is what every image used to get.
image_extensions = {
    "image/png": "png",
    "image/jpeg": "jpg",
    "image/gif": "gif",
    "image/bmp": "bmp",
    "image/tiff": "tif",
    "image/svg+xml": "svg",
    "image/webp": "webp",
    "image/x-emf": "emf",
    "image/x-wmf": "wmf",
}

image_counter = 0

with open(input_filename, "rb") as docx_file:
    # Convert DOCX to HTML. No custom image handler is passed, so the loop below
    # expects each <img src> to carry the image inline as a base64 data URI.
    result = mammoth.convert_to_html(docx_file, style_map=custom_styles)
    html = result.value

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

# Extract the inlined images and save them as files in the "images" folder
for image in soup.find_all("img"):
    image_data = image.get("src")
    if not image_data:
        continue

    # Expected shape: "data:<mime type>;base64,<payload>"
    header, separator, payload = image_data.partition(",")
    if not separator or not header.startswith("data:") or not header.endswith(";base64"):
        # Not an inline base64 image (e.g. a plain URL): there is nothing to
        # decode, so leave the <img> tag untouched instead of raising.
        continue

    # Pick the extension from the declared MIME type so that JPEG/GIF/EMF
    # images are not written to disk with a misleading ".png" name.
    mime_type = header[len("data:"):].split(";", 1)[0].strip().lower()
    extension = image_extensions.get(mime_type, "png")

    # Decode the Base64 encoded image
    image_binary = base64.b64decode(payload)

    # Save the image to the "images" folder
    image_counter += 1
    new_image_filename = f"images/image_{image_counter}.{extension}"
    with open(new_image_filename, "wb") as img_file:
        img_file.write(image_binary)

    # Replace the <img> with a visible placeholder paragraph that records the
    # alt text and the name of the extracted file
    alt_text = image.get("alt", "No alt text available")
    placeholder = soup.new_tag("p")
    placeholder.string = f"[!! Image goes here !!] Alt text: {alt_text} (File: {new_image_filename})"
    image.replace_with(placeholder)

print(f"Total images extracted: {image_counter}")

# Find all tables, wrap each in a scrollable container div, and rebuild it as
# <thead> (first row) + <tbody> (remaining rows)
for table in soup.find_all("table"):
    div = soup.new_tag("div", **{"class": "usa-table-container--scrollable", "tabindex": "0"})
    table.wrap(div)

    # Only take rows whose nearest enclosing <table> is this one. A plain
    # recursive find_all("tr") would also pull the rows out of any table nested
    # inside a cell and leave that nested table empty.
    rows = [row for row in table.find_all("tr") if row.find_parent("table") is table]
    if not rows:
        continue

    header_row, body_rows = rows[0], rows[1:]

    thead = soup.new_tag("thead")
    tbody = soup.new_tag("tbody")

    # Appending a tag that is already in the tree moves it, so after these
    # appends the rows no longer live under the original table children.
    thead.append(header_row)
    for row in body_rows:
        tbody.append(row)

    # Drop whatever is left in the table (old wrappers, whitespace) and
    # re-attach the rows through the new thead/tbody
    table.clear()
    table.append(thead)
    table.append(tbody)

    # Every cell of the header row becomes <th scope="col">
    for cell in header_row.find_all(["th", "td"], recursive=False):
        cell.name = "th"
        cell["scope"] = "col"

    for row in body_rows:
        # Enumerate all direct cells (th and td) so the index is the real
        # column position; counting only <td> would skip the second column
        # whenever a row starts with a <th>.
        for column, cell in enumerate(row.find_all(["th", "td"], recursive=False)):
            if cell.name == "th":
                # Existing <th> cells in body rows keep getting scope="col"
                cell["scope"] = "col"
            elif column > 0:
                # Right-align data cells, except for the first column
                cell["style"] = "text-align: right;"

    # Remove any <p> wrappers within the table, keeping their contents
    for p in table.find_all("p"):
        p.unwrap()
                    
# Collect the "back to reference" links of footnotes and endnotes: anchors whose
# href points at a #footnote-ref... / #endnote-ref... target
footnote_refs = soup.find_all("a", href=lambda href: bool(href) and href.startswith("#footnote-ref"))
endnote_refs = soup.find_all("a", href=lambda href: bool(href) and href.startswith("#endnote-ref"))

# Give every back-link a numbered aria-label (numbering restarts at 1 for each
# kind of note) and replace its visible text
for note_kind, back_links in (("footnote", footnote_refs), ("endnote", endnote_refs)):
    for number, back_link in enumerate(back_links, start=1):
        back_link["aria-label"] = f"Back to {note_kind} {number}"
        back_link.string = "↩ Back"
    
# Clean up residual TOC anchors: <a> tags whose id starts with "_".
# Empty anchors are removed outright. If an anchor wraps any content, only the
# <a> tag itself is stripped so that the enclosed content is not deleted from
# the document.
links_to_remove = soup.find_all("a", id=lambda x: x and x.startswith("_"))
for link in links_to_remove:
    if link.contents:
        link.unwrap()
    else:
        link.decompose()
    
# Serialize the modified tree as indented HTML
pretty_html = soup.prettify()

# Write the result to disk as UTF-8
with open(pretty_filename, mode="w", encoding="utf-8") as html_out:
    html_out.write(pretty_html)

print(f"Prettified HTML content is at {pretty_filename}")


Total images extracted: 9
Prettified HTML content is at pretty_output.html
