Permalink
Find file
Fetching contributors…
Cannot retrieve contributors at this time
executable file 41 lines (34 sloc) 1.04 KB
#!/usr/bin/env python
import sys
import os.path
import mammoth
from bs4 import BeautifulSoup
def docx2html(infile, outfile):
    """Convert the Word document *infile* to pretty-printed HTML.

    Args:
        infile: Path of the .docx file to convert.
        outfile: Path of the HTML file to write, or "--" to print the
            HTML on standard output instead of writing a file.

    Any messages reported by mammoth during conversion are printed to
    standard error, one per line.
    """
    # A .docx file is a zip archive, so it must be read in binary mode;
    # text mode would try to decode it and fail or corrupt the data.
    with open(infile, "rb") as fp:
        mammout = mammoth.convert_to_html(fp)

    # Diagnostics go to stderr so they never get mixed into the HTML
    # when the caller asked for output on stdout ("--").
    for m in mammout.messages:
        print("Mammoth %s: %s" % (m.type, m.message), file=sys.stderr)

    # Prettyprint it. Keep the result as text (str): printing encoded
    # bytes would emit the b'...' repr rather than the HTML itself.
    soup = BeautifulSoup(mammout.value, "lxml")
    html = soup.prettify()

    if outfile == "--":
        print(html)
    else:
        # Encode once, at the I/O boundary, as UTF-8.
        with open(outfile, "w", encoding="utf-8") as fp:
            print(html, file=fp)
if __name__ == "__main__":
    # Exactly one or two arguments are accepted; -h/--help (or a wrong
    # argument count) prints the usage line and exits with status 1.
    if len(sys.argv) < 2 or len(sys.argv) > 3 \
            or sys.argv[1] in ("-h", "--help"):
        # Interpolate the program name; the bare format string used to
        # print a literal "%s".
        print("Usage: %s infile.docx [outfile.html]" % sys.argv[0])
        sys.exit(1)

    infile = sys.argv[1]

    if len(sys.argv) == 3:
        outfile = sys.argv[2]
    else:
        # No output name given: derive it from the input name by swapping
        # a .docx extension for .html, or appending .html otherwise.
        base, ext = os.path.splitext(infile)
        if ext.lower() == ".docx":
            outfile = base + ".html"
        else:
            outfile = infile + ".html"

    # Progress note goes to stderr so it cannot end up inside HTML that
    # is being written to stdout (outfile "--").
    print("mammoth", infile, outfile, file=sys.stderr)
    docx2html(infile, outfile)