In [10]:
import mammoth
from bs4 import BeautifulSoup
import os
from docx import Document

In [11]:
# Word document that the conversion cell below opens.
input_filename = "tsa_report.docx"
# NOTE(review): assigned here but not referenced by any of the visible cells.
output_filename = "ugly_output.html"
# File that receives the prettified HTML at the end of the notebook.
pretty_filename = "pretty_output.html"
# Folder that extract_images() writes the embedded images into.
images_folder = "images"

# Style map handed to mammoth.convert_to_html(style_map=...); the single rule
# is meant to put the "usa-table" class on converted tables.
custom_styles = """ table => table.usa-table
                    """

In [12]:
# Make sure the image output folder exists. exist_ok=True keeps the cell safe
# to re-run and avoids the gap between a separate exists() check and makedirs().
os.makedirs(images_folder, exist_ok=True)

def extract_images(docx_file):
    """Save every image embedded in a DOCX file into ``images_folder``.

    Args:
        docx_file: Path or binary file-like object that ``Document`` can open.

    Returns:
        A list of ``(image_name, alt_text)`` tuples, one per saved image, in
        the order the document's relationships are iterated. Files are named
        ``image_<index>.<extension>``, with the extension taken from the
        image's path inside the package.
    """
    document = Document(docx_file)
    image_info = []

    for rel in document.part.rels.values():
        # Linked (external) targets have no part inside the package, so
        # reading ``target_part`` on them raises; only embedded parts can be
        # written to disk.
        if rel.is_external:
            continue
        # Select by relationship type instead of searching the target path for
        # "image": the path test misses images stored under other names and
        # can match unrelated targets whose path happens to contain the word.
        if not rel.reltype.endswith("/image"):
            continue

        image_part = rel.target_part
        image_extension = os.path.splitext(rel.target_ref)[-1].replace('.', '')
        # The running count of saved images doubles as the file index.
        image_name = f"image_{len(image_info)}.{image_extension}"
        image_path = os.path.join(images_folder, image_name)

        # Save image to folder
        with open(image_path, "wb") as image_file:
            image_file.write(image_part.blob)

        # NOTE(review): the image part may never expose ``alt_text`` (the
        # description is presumably stored on the drawing in the document
        # body), in which case every entry gets the fallback — confirm.
        alt_text = getattr(image_part, "alt_text", "No alt text available")
        image_info.append((image_name, alt_text))

    return image_info

with open(input_filename, "rb") as docx_file:
    # Save the embedded images to disk first
    images = extract_images(docx_file)
    print(f"Extracted {len(images)} images")

    # The same stream is handed to a second reader, so rewind it rather than
    # relying on that reader to reposition the stream itself.
    docx_file.seek(0)

    # Convert DOCX to HTML. Any <img> elements in the result are replaced by
    # text placeholders further down.
    result = mammoth.convert_to_html(docx_file, style_map=custom_styles)
    html = result.value

# Surface conversion warnings (e.g. unmapped styles) instead of dropping them
for message in result.messages:
    print(f"mammoth {message.type}: {message.message}")

# Build a BeautifulSoup tree from the converted HTML
soup = BeautifulSoup(html, "html.parser")


def _id_starts_with_underscore(value):
    """Truthy when an id attribute is present and begins with "_"."""
    return value and value.startswith("_")


# Delete every <a> element whose id begins with an underscore
links_to_remove = soup.find_all("a", id=_id_starts_with_underscore)
for anchor in links_to_remove:
    anchor.decompose()

# Collect the links whose href targets a footnote / endnote reference anchor
footnote_refs = soup.find_all("a", href=lambda target: target and target.startswith("#footnote-ref"))
endnote_refs = soup.find_all("a", href=lambda target: target and target.startswith("#endnote-ref"))


def _number_aria_labels(links, label_for):
    """Set aria-label on each link, numbering them from 1 in list order."""
    for number, link in enumerate(links, start=1):
        link["aria-label"] = label_for(number)


# Give every collected link a numbered, screen-reader friendly label
_number_aria_labels(footnote_refs, lambda i: f"Back to footnote {i}")
_number_aria_labels(endnote_refs, lambda i: f"Back to endnote {i}")

# Replace each image with a bolded placeholder paragraph carrying its alt text,
# and lift the placeholder out of any <p> that wrapped the image so the result
# never contains a <p> nested inside another <p>.
for image in soup.find_all("img"):
    alt_text = image.get("alt", "No alt text available")

    bolded_paragraph = soup.new_tag("p")
    bold = soup.new_tag("strong")
    bold.string = f"[!! Image goes here !!] Alt text: {alt_text}"
    bolded_paragraph.append(bold)

    wrapper = image.find_parent("p")
    if wrapper is None:
        # Not inside a paragraph: a straight swap is enough.
        image.replace_with(bolded_paragraph)
        continue

    # Put the placeholder in front of the wrapping paragraph. Inserting before
    # (not after) keeps several images from one paragraph in document order.
    image.extract()
    wrapper.insert_before(bolded_paragraph)

    # Drop the wrapper once it holds neither text nor another image; a wrapper
    # with remaining content stays where it was.
    if not wrapper.get_text(strip=True) and wrapper.find("img") is None:
        wrapper.decompose()

def _own_rows(table):
    """Return the <tr> elements belonging to ``table`` itself, not to tables nested in its cells."""
    return [row for row in table.find_all("tr") if row.find_parent("table") is table]


def _own_cells(row, names):
    """Return the cells of ``row`` whose tag name is in ``names``, skipping cells of nested tables."""
    return [cell for cell in row.find_all(names) if cell.find_parent("tr") is row]


# Wrap every table in a scrollable container and rebuild it as
# <thead> (first row) + <tbody> (remaining rows).
for table in soup.find_all("table"):
    div = soup.new_tag("div", **{"class": "usa-table-container--scrollable", "tabindex": "0"})
    table.wrap(div)

    # Only this table's own rows are restructured; rows of a nested table stay
    # inside their cell and are handled when the loop reaches that table.
    rows = _own_rows(table)
    if not rows:
        continue

    header_row, body_rows = rows[0], rows[1:]

    thead = soup.new_tag("thead")
    tbody = soup.new_tag("tbody")
    thead.append(header_row)
    for row in body_rows:
        tbody.append(row)

    # Discard whatever is left in the table (the rows were already moved out)
    # and install the new sections.
    table.clear()
    table.append(thead)
    table.append(tbody)

    # Every cell of the header row becomes a column header
    for cell in _own_cells(header_row, ["th", "td"]):
        cell.name = "th"
        cell['scope'] = 'col'

    # <th> cells that remain in the body rows also get scope="col"
    for row in body_rows:
        for cell in _own_cells(row, "th"):
            cell['scope'] = 'col'

    # Remove any <p> elements within tables, keeping their contents
    for p in table.find_all("p"):
        p.unwrap()

    # Right-align the body <td> cells except the first <td> of each row
    # (the index counts <td> cells only, not <th> cells)
    for row in body_rows:
        for i, cell in enumerate(_own_cells(row, "td")):
            if i > 0:  # Skip the first column
                cell['style'] = 'text-align: right;'

# Serialise the modified tree as indented HTML
pretty_html = soup.prettify()

# Save the result as UTF-8 text
with open(pretty_filename, mode="w", encoding="utf-8") as pretty_file:
    pretty_file.write(pretty_html)

print(f"Prettified HTML content is at {pretty_filename}")


Extracted 9 images
Prettified HTML content is at pretty_output.html
