In [201]:
import os

import mammoth
from bs4 import BeautifulSoup, Comment
from docx import Document

In [202]:
# Input document and output locations for the DOCX -> HTML conversion.
input_filename = "tsa_report.docx"
output_filename = "ugly_output.html"  # NOTE(review): not referenced by the cells visible here
pretty_filename = "pretty_output.html"

# Folder that receives the images pulled out of the DOCX. It is read by the
# extraction cell but was never assigned in the notebook, so a fresh kernel
# failed with NameError.
# NOTE(review): "images" is a placeholder - confirm the intended folder name.
images_folder = "images"

# Mammoth style map: render every table with the usa-table classes.
custom_styles = """ table => table.usa-table.usa-table--stacked
                    """

In [203]:
# Make sure the image output folder exists before anything is written to it.
# exist_ok=True makes this a no-op on re-runs and removes the gap between
# checking for the folder and creating it.
os.makedirs(images_folder, exist_ok=True)

def extract_images(docx_file):
    """Write the images embedded in a DOCX file into ``images_folder``.

    Only the relationships of the main document part are scanned. Files are
    named ``image_<n>.<ext>``, numbered in the order those relationships are
    iterated, with the extension taken from the relationship target.

    Args:
        docx_file: DOCX source handed straight to ``Document`` (the notebook
            passes an open binary file object).

    Returns:
        A list of ``(image_name, alt_text)`` tuples, one per saved image, where
        ``image_name`` is the file name created inside ``images_folder``.
    """
    document = Document(docx_file)
    image_info = []

    for rel in document.part.rels.values():
        # Select by relationship type instead of looking for "image" in the
        # target name: a hyperlink whose URL contains "image" is not a picture,
        # and external (linked) targets have no part to read bytes from.
        if rel.is_external or not rel.reltype.endswith("/image"):
            continue

        image_part = rel.target_part
        image_extension = os.path.splitext(rel.target_ref)[-1].lstrip('.')
        # The number of images saved so far doubles as the running index.
        image_name = f"image_{len(image_info)}.{image_extension}"
        image_path = os.path.join(images_folder, image_name)

        # Save image to folder
        with open(image_path, "wb") as image_file:
            image_file.write(image_part.blob)

        # NOTE(review): falls back to the placeholder whenever the image part
        # has no ``alt_text`` attribute - confirm python-docx ever provides one.
        alt_text = getattr(image_part, 'alt_text', "No alt text available")
        image_info.append((image_name, alt_text))

    return image_info

with open(input_filename, "rb") as docx_file:
    # Extract images manually
    images = extract_images(docx_file)
    print(f"Extracted {len(images)} images")

    # extract_images has already read from this handle; rewind it so the
    # converter starts from the beginning of the stream.
    docx_file.seek(0)

    # Convert the DOCX to HTML using the custom style map. The <img> elements
    # in the result are replaced further down.
    result = mammoth.convert_to_html(docx_file, style_map=custom_styles)
    html = result.value

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

# Replace every <img> with a real HTML comment that carries its alt text.
# (A tag created with new_tag("comment") is serialized as a <comment> element,
# not as <!-- ... -->.)
for image in soup.find_all("img"):
    alt_text = image.get("alt", "No alt text available")
    # A comment ends at the first "-->", so escape ">" to keep the alt text
    # from closing the comment early.
    safe_alt_text = alt_text.replace(">", "&gt;")
    image.replace_with(soup.new_string(f" Alt text: {safe_alt_text} ", Comment))

# Unwrap <p> elements whose only content is one of those comments. Checking
# for a Comment keeps ordinary paragraphs that merely contain the words
# "Alt text:" wrapped.
for p in soup.find_all("p"):
    if isinstance(p.string, Comment) and "Alt text:" in p.string:
        p.unwrap()

# Give every table a <thead> holding its first row and a <tbody> holding the rest.
for table in soup.find_all("table"):
    # Keep only the rows that belong to this table: find_all() searches
    # recursively and would otherwise also pull the rows out of any table
    # nested inside one of the cells.
    own_rows = [row for row in table.find_all("tr") if row.find_parent("table") is table]
    if not own_rows:
        continue

    thead = soup.new_tag("thead")
    tbody = soup.new_tag("tbody")

    # append() moves a tag out of its current parent, so the rows leave the
    # table here; nested tables travel along inside their cells.
    thead.append(own_rows[0])
    for row in own_rows[1:]:
        tbody.append(row)

    # Drop whatever is left in the table, then attach the new sections.
    table.clear()
    table.append(thead)
    table.append(tbody)

    # Remove any <p> elements within tables
    for p in table.find_all("p"):
        p.unwrap()

# Serialize the modified tree with indentation and save it as UTF-8.
pretty_html = soup.prettify()

with open(pretty_filename, mode="w", encoding="utf-8") as html_out:
    html_out.write(pretty_html)

# Tell the reader where the result was written.
print(f"Prettified HTML content is at {pretty_filename}")


Extracted 9 images
Prettified HTML content is at pretty_output.html
