In [18]:
import docx
import docx.document
import docx.text.paragraph

# Path of the Word document to convert. The same string is reused later as the
# content of the level-1 heading that opens the parsed question list.
FILENAME = "sample_input.docx"
# Parsed document shared by the later cells: they read its paragraphs and look
# up image relationships through `doc.part.rels`.
doc: docx.document.Document = docx.Document(FILENAME)

def get_content(p: docx.text.paragraph.Paragraph) -> tuple:
    """Pair a paragraph's text with its style.

    Args:
        p: Paragraph to inspect.

    Returns:
        A 2-tuple ``(text, style)`` built from ``p.text`` and ``p.style``.
    """
    text, style = p.text, p.style
    return text, style

# Debug dump: one "(text, style)" tuple per paragraph, one per line, so the raw
# structure of the document can be inspected in a text editor.
ps = [get_content(paragraph) for paragraph in doc.paragraphs]
# Explicit UTF-8: the platform default encoding (e.g. a legacy Windows code
# page) cannot represent every character a document may contain and would
# raise UnicodeEncodeError part-way through the dump.
with open("sample_output.txt", "w", encoding="utf-8") as f:
    for p in ps:
        f.write(str(p) + "\n")

In [19]:
class Question:
    level: int
    content: str
    options: str | list[str]
    comment: list[str]
    image: str | None
    option_is_image: bool
    
    def __init__(self, *, level: int = 0, content: str = "", options: list[str] = [], comment: list[str] = [], image: str | None = None, option_is_image: bool = False) -> None:
        self.level = level
        self.content = content
        self.options = options.copy()
        self.comment = comment.copy()
        self.image = image
        self.option_is_image = option_is_image
        

In [20]:
import docx.opc.rel
import docx.parts.image
import docx.oxml.shape
import os, re

def get_image(rid: str) -> bytes:
    rels = doc.part.rels
    rel: docx.opc.rel._Relationship = rels[rid]
    if "image" in rel.target_ref:
        part: docx.parts.image.ImagePart = rel.target_part
        return part.blob

os.makedirs("images", exist_ok=True)

# Blank answer bracket that opens every question paragraph.
QUESTION_PREFIX = "(   )"
# Marker of the first option; its presence means "this text carries options".
FIRST_OPTION = "(Ａ)"
# Any of the four full-width option markers.
OPTION_MARKER = re.compile(r"\([ＡＢＣＤ]\)")


def _clean(fragment: str) -> str:
    """Trim the ideographic spaces (U+3000) that pad stems and options."""
    return fragment.strip("\u3000")


def _parse_question(text: str) -> Question:
    """Build a Question from a paragraph that starts with QUESTION_PREFIX."""
    # removeprefix drops exactly the bracket. The previous strip("(   )")
    # treated its argument as a character set and also ate a ")" that closed
    # the question text or a "(" that opened it.
    body = text.removeprefix(QUESTION_PREFIX).strip(" ")
    # partition keeps everything after the first marker, and also handles a
    # question whose stem is empty (marker at position 0).
    stem, marker, rest = body.partition(FIRST_OPTION)
    question = Question(content=_clean(stem))
    if marker:
        question.options.extend(_clean(part) for part in OPTION_MARKER.split(rest))
    return question


def _save_images(paragraph) -> list[str]:
    """Write every embedded image of ``paragraph`` to images/<rid>.png.

    Returns the relationship ids of the images that were actually written,
    in document order.
    """
    saved: list[str] = []
    blips: list[docx.oxml.shape.CT_Blip] = paragraph._element.xpath(".//a:blip")
    for blip in blips:
        # Read r:embed explicitly instead of "the first attribute", which
        # depends on how the producer ordered the XML attributes.
        rid = blip.embed
        if not rid:
            continue  # linked-only picture: nothing is stored in the document
        blob = get_image(rid)
        if blob is None:
            continue  # relationship does not point at an image part
        with open(f"images/{rid}.png", "wb") as f:
            f.write(blob)
        saved.append(rid)
    return saved


questions: list[Question] = []
# The document name becomes the top-level heading of the output.
questions.append(Question(level=1, content=FILENAME))

for p in doc.paragraphs:
    text = str(p.text)

    # Paragraphs that start with an ideographic space are skipped entirely,
    # including any images they contain.
    if text.startswith("\u3000"):
        continue
    # A paragraph starting with "單" becomes a fixed section heading.
    if text.startswith("單"):
        questions.append(Question(level=2, content="單一選擇題"))
        continue

    if text.startswith(QUESTION_PREFIX):
        questions.append(_parse_question(text))
    elif FIRST_OPTION in text:
        # Options on their own line belong to the most recent entry; whatever
        # precedes the first marker is discarded.
        questions[-1].options.extend(
            _clean(part) for part in OPTION_MARKER.split(text)[1:]
        )
    elif text:
        # Any other non-empty text is kept as a comment on the latest entry.
        questions[-1].comment.append(text)

    image_ids = _save_images(p)
    if len(image_ids) == 1:
        # A single picture illustrates the question itself.
        questions[-1].image = image_ids[0]
    elif len(image_ids) == 4:
        # Four pictures are taken to be the four answer options.
        questions[-1].option_is_image = True
        questions[-1].options = image_ids

In [58]:
from html import escape

# True while an <ol> is open in the output.
ol_lock = False

# Option labels "A".."Z".
# NOTE(review): despite the name these are ASCII letters (U+0041...), not
# full-width forms -- confirm that is intended.
full_alph = [chr(ord("A") + k) for k in range(26)]

# <head> now precedes <body>, and the charset matches the file encoding.
PAGE_HEAD = (
    '<!DOCTYPE html><html><head><meta charset="utf-8">'
    '<style>ol{margin-left:-2.2em;list-style:none;counter-reset:counter;}'
    'ol>li{counter-increment:counter;}'
    'ol>li::before{content:counter(counter)". (　)";}'
    'ol>li>div{margin-top:-2.35em;margin-left:3.5em;}</style>'
    '</head><body>'
)


def _render_question(q: Question) -> str:
    """Return the ``<li>`` markup for one question.

    All text taken from the document is HTML-escaped so that characters such
    as ``<`` or ``&`` in a question cannot break the page.
    """
    parts = ["<li><div>"]
    if q.image:
        parts.append(
            f'<img src="./images/{escape(q.image)}.png" style="float:right" width="150px" />'
        )
    parts.append(f"<p>{escape(q.content)}<br>")
    if q.comment:
        parts.append("</p><p><small>")
        parts.extend(f"{escape(cmt)}<br>" for cmt in q.comment)
        parts.append("</small></p><p>")
    last = len(q.options) - 1
    for i, option in enumerate(q.options):
        label = full_alph[i]
        if q.option_is_image:
            # Image options get their own paragraph, opened before the first
            # and closed after the last option (not a hard-coded fourth).
            if i == 0:
                parts.append("<p>")
            parts.append(
                f'({label}) <img src="./images/{escape(option)}.png" width="120px" style="vertical-align:top">\u3000'
            )
            if i == last:
                parts.append("</p>")
        else:
            parts.append(f"({label}){escape(option)}\u3000")
    if q.image:
        # Stop the floated illustration from overlapping the next question.
        parts.append('<br clear="right" />')
    parts.append("</p></div></li>")
    return "".join(parts)


# Explicit UTF-8 so the output does not depend on the platform's default
# encoding. The handle is named `out` to avoid confusion with the `html` module.
with open("sample_output.html", "w", encoding="utf-8") as out:
    out.write(PAGE_HEAD)
    for q in questions:
        if q.level > 0:
            # Headings live outside the numbered list: close it first.
            if ol_lock:
                out.write("</ol>")
                ol_lock = False
            out.write(f"<h{q.level}>{escape(q.content)}</h{q.level}>")
        else:
            if not ol_lock:
                out.write("<ol>")
                ol_lock = True
            out.write(_render_question(q))
    # Close the list that is still open when the document ends on a question.
    if ol_lock:
        out.write("</ol>")
        ol_lock = False
    out.write("</body></html>")
    