In [29]:
import mammoth
from bs4 import BeautifulSoup
import os

In [30]:
# Directory that is scanned for the DOCX files to convert.
input_directory = "2023"

# Directory that receives the generated HTML files.
output_directory = "output_html"

# mammoth style map: every table is emitted as <table class="usa-table">.
custom_styles = "table => table.usa-table"

def ignore_image(image):
    """Image handler that discards its input.

    Passed to ``mammoth.convert_to_html`` as ``convert_image``. The ``image``
    argument is never inspected; a new empty list is returned on every call.
    """
    nothing_to_emit = []
    return nothing_to_emit

# Create the output directory if needed. exist_ok=True replaces the separate
# existence check, so there is no window between checking and creating.
os.makedirs(output_directory, exist_ok=True)

# Collect the DOCX files to convert:
#   * the extension match is case-insensitive, so "REPORT.DOCX" is included;
#   * names starting with "~$" are skipped -- these are the lock files Word
#     keeps next to an open document and they are not valid DOCX archives;
#   * the list is sorted because os.listdir() returns entries in arbitrary
#     order, which would make the processing order differ between runs.
input_filenames = sorted(
    f
    for f in os.listdir(input_directory)
    if f.lower().endswith(".docx") and not f.startswith("~$")
)

def _rows_owned_by(table):
    """Return the <tr> tags that belong to `table` itself, in document order.

    A row counts as owned when its nearest enclosing <table> is `table`.
    This keeps rows that sit under an existing <thead>/<tbody> but leaves
    out rows of any table nested inside one of the cells.
    """
    return [row for row in table.find_all("tr") if row.find_parent("table") is table]


def _restructure_table(soup, table):
    """Wrap `table` in a scrollable container and normalise its layout.

    Steps, applied in place:
      1. wrap the table in <div class="usa-table-container--scrollable">;
      2. move the first owned row into a new <thead> and the remaining owned
         rows into a new <tbody>, replacing the table's previous content;
      3. unwrap every <p> inside the table (the text is kept, the tag is not);
      4. right-align every <td> of each owned row except the first one.

    A table without any row is only wrapped in the container div.

    Args:
        soup: the BeautifulSoup document that owns `table`; used to create
            the new tags.
        table: the <table> tag to modify.
    """
    container = soup.new_tag("div", **{"class": "usa-table-container--scrollable"})
    table.wrap(container)

    rows = _rows_owned_by(table)
    if not rows:
        return

    thead = soup.new_tag("thead")
    tbody = soup.new_tag("tbody")

    # Appending a tag moves it, so after this the rows live in thead/tbody
    # and are unaffected by the clear() below.
    thead.append(rows[0])
    for row in rows[1:]:
        tbody.append(row)

    # Drop whatever is left in the table (old wrappers, whitespace) and
    # attach the new sections.
    table.clear()
    table.append(thead)
    table.append(tbody)

    # Remove any <p> elements within the table, keeping their contents.
    for paragraph in table.find_all("p"):
        paragraph.unwrap()

    # Right-align every cell but the first in header and body rows alike.
    # recursive=False restricts the match to the row's own cells so that
    # cells of a nested table do not shift the column index.
    for row in rows:
        for cell in row.find_all("td", recursive=False)[1:]:
            cell["style"] = "text-align: right;"


for input_filename in input_filenames:
    input_filepath = os.path.join(input_directory, input_filename)
    base_name = os.path.splitext(input_filename)[0]
    output_filename = os.path.join(output_directory, f"{base_name}_ugly_output.html")
    pretty_filename = os.path.join(output_directory, f"{base_name}_pretty_output.html")

    # Convert the DOCX to HTML, applying the style map and dropping images.
    with open(input_filepath, "rb") as docx_file:
        result = mammoth.convert_to_html(docx_file, style_map=custom_styles, convert_image=ignore_image)
    html = result.value

    # Save the HTML exactly as mammoth produced it.
    with open(output_filename, "w", encoding="utf-8") as html_file:
        html_file.write(html)

    # Parse the HTML so the tables can be post-processed.
    soup = BeautifulSoup(html, "html.parser")

    # find_all() returns the full list up front, so restructuring tables
    # while iterating is safe; outer tables are handled before nested ones.
    for table in soup.find_all("table"):
        _restructure_table(soup, table)

    # Prettify the HTML
    pretty_html = soup.prettify()

    # Write the prettified HTML to a new file
    with open(pretty_filename, "w", encoding="utf-8") as file:
        file.write(pretty_html)

    print(f"Prettified HTML content for {input_filename} is at {pretty_filename}")


Prettified HTML content for Table 1.docx is at output_html\Table 1_pretty_output.html
Prettified HTML content for Table 2.docx is at output_html\Table 2_pretty_output.html
Prettified HTML content for Table 3.docx is at output_html\Table 3_pretty_output.html
Prettified HTML content for Table 4.docx is at output_html\Table 4_pretty_output.html
Prettified HTML content for Table 5.docx is at output_html\Table 5_pretty_output.html
Prettified HTML content for Table 6.docx is at output_html\Table 6_pretty_output.html
