In [1]:
%load_ext Cython

In [2]:
import re
import json
from enum import Enum
from docx import Document
from pydantic import BaseModel
from typing import Generator

In [3]:
class Status(Enum):
    """Category that ``classify`` assigns to a single paragraph of text."""

    # Heading levels, outermost first.
    TITLE1 = 1
    TITLE2 = 2
    TITLE3 = 3

    # Paragraph that starts with "[" (a bracket-labelled line).
    TEXT = 4

    # Paragraph that matches none of the patterns.
    NONE = 5

    # Paragraph that matches the "方" prescription pattern.
    FANG = 6

In [4]:
def get_content(path: str = "res/苗药方剂学_pre1.docx") -> Generator[str, None, None]:
    """Yield the text of every non-empty paragraph in a Word document.

    Args:
        path: Location of the ``.docx`` file. The default points at the same
            file as before, written with a forward slash so the literal has
            no invalid ``\\`` escape and also resolves outside Windows.

    Yields:
        The ``text`` of each paragraph, in document order, skipping
        paragraphs whose text is empty or a lone newline.
    """
    document = Document(path)
    for paragraph in document.paragraphs:
        line = paragraph.text
        # Test the paragraph's text: a Paragraph object never compares equal
        # to a str, so checking the object itself would filter nothing.
        if line in ("", "\n"):
            continue
        yield line

In [5]:
# Paragraph classifiers used by classify(). All patterns are raw strings so
# that regex escapes such as \( and \[ are not treated as (invalid) Python
# string escapes; the compiled patterns are unchanged.

# "第", one or more characters, then "章" -- anywhere in the text.
title_1 = re.compile(r"第.+章")
# One or more of the numerals 一..十 followed by "、" -- anywhere in the text.
title_2 = re.compile(r"(一|二|三|四|五|六|七|八|九|十)+、")
# Text that begins with an ASCII "(" or a full-width "（".
title_3 = re.compile(r"^(\(|（)")
# Text that begins with "[".
text = re.compile(r"^\[")
# "方" followed by ":", "：" or "]" -- anywhere in the text.
nd = re.compile(r".?方(:|：|])")

In [6]:
def classify(txt:str) -> Status:
    """Return the Status of the first pattern that matches ``txt``.

    Patterns are tried in a fixed order -- the three heading levels, then the
    prescription pattern, then the bracket-label pattern -- so a paragraph
    that matches several of them takes the earliest one. A paragraph that
    matches none is ``Status.NONE``.
    """
    ordered_rules = (
        (title_1, Status.TITLE1),
        (title_2, Status.TITLE2),
        (title_3, Status.TITLE3),
        (nd, Status.FANG),
        (text, Status.TEXT),
    )
    for pattern, status in ordered_rules:
        # Only the existence of a match matters, not its contents.
        if pattern.search(txt):
            return status
    return Status.NONE

In [7]:
class Node(BaseModel):
    """One prescription record; every field is optional text, None until set.

    The field names are the Chinese section labels that ``node`` looks for in
    a paragraph's bracketed label (English glosses are given per field).
    """

    # Prescription text; filled from a prescription (FANG) line.
    用方: str | None = None
    # Usage / administration.
    用法: str | None = None
    # Explanation of the formula.
    方解: str | None = None
    # Indications.
    主治: str | None = None
    # Treatment principle.
    治则: str | None = None
    # Meridian attribution.
    属经: str | None = None
    # Cautions.
    注意: str | None = None

In [22]:
class Profile(BaseModel):
    """Set of integer counters, each starting at 0.

    The first six names parallel the ``Status`` members and the remaining
    seven parallel the ``Node`` fields.
    NOTE(review): presumably per-category tallies kept while profiling the
    parser -- nothing in the visible code increments them; confirm.
    """

    # One counter per Status member.
    t1: int = 0
    t2: int = 0
    t3: int = 0
    text: int = 0
    fang: int = 0
    none: int = 0

    # One counter per Node field.
    用方: int = 0
    用法: int = 0
    方解: int = 0
    主治: int = 0
    治则: int = 0
    属经: int = 0
    注意: int = 0

In [23]:
# Module-level Profile instance with every counter at its default of 0.
# NOTE(review): none of the visible cells read or update it -- confirm it is
# still needed.
profile = Profile()

In [28]:
# %%cython
# 用方=1546, 用法=1904, 方解=1911, 主治=398, 治则=439, 属经=431, 注意=1
def node(value: str, previous: dict, node_id: int) -> str:
    """Store one paragraph on a ``Node`` kept in ``previous["data"]``.

    The paragraph is cut at its first "]": what precedes it is the label and
    what follows is the body. A paragraph without "]" is used whole as both
    label and body.

    Args:
        value: Raw paragraph text.
        previous: Section dict. Its "data" list is created on first use and
            is mutated in place.
        node_id: 0 starts a new ``Node`` and stores the body in ``用方``.
            Any other value updates the most recent ``Node`` (one is created
            if the list is empty) in the field selected by the label.

    Returns:
        The body text. If ``node_id`` is non-zero and the label matches no
        known keyword, nothing is stored and ``"<node_id>:<body>"`` is
        printed instead.
    """
    # Label keywords, checked in this order, and the Node field each one fills.
    # NOTE(review): "佳合蒙" and "兴冷" look like alternate label spellings used
    # in the source document -- confirm.
    label_fields = (
        (("用法",), "用法"),
        (("方解",), "方解"),
        (("主治",), "主治"),
        (("治则", "佳合蒙"), "治则"),
        (("属经", "兴冷"), "属经"),
        (("注意",), "注意"),
    )

    nodes = previous.setdefault("data", [])

    if node_id == 0 or not nodes:
        entry = Node()
        nodes.append(entry)
    else:
        # Every non-zero id means "continue the latest record", so the record
        # is always bound no matter which id the caller passes.
        entry = nodes[-1]

    # Cut at the FIRST "]" only: a later "]" inside the body must not discard
    # the text that precedes it.
    label, closing, body = value.partition("]")
    node_txt = body if closing else value

    if node_id == 0:
        entry.用方 = node_txt
        return node_txt

    for keywords, field in label_fields:
        if any(keyword in label for keyword in keywords):
            setattr(entry, field, node_txt)
            break
    else:
        # Unknown label: report it rather than guessing a field.
        print(str(node_id) + ":" + node_txt)

    return node_txt

In [29]:
data = dict()
contents = get_content()

# t1=16, t2=212, t3=579, text=5084, fang=1546, none=1057

# profile from 5.4s to 2s 60%+ upper

# 通过短路求值来简化条件语句，减少不必要的条件判断，从而提高代码执行效率。
for value in contents:
    match classify(value):
        case Status.TEXT:
            node_id=1
            txt = node(value,previous,node_id)
        case Status.FANG:
            node_id = 0
            txt = node(value,previous,node_id)
        case Status.NONE:
            try:
                txt = txt + value
            except NameError:
                try:
                    title = previous["title"]
                except KeyError:
                    title = list()
                    previous["title"] = title
                finally:
                    title.append(value)
        case Status.TITLE3:
            try:
                del txt
            except NameError:
                pass
            tt3 = dict()
            tt2[value] = tt3
            previous = tt3
        case Status.TITLE2:
            try:
                del txt
            except NameError:
                pass
            tt2 = dict()
            tt1[value] = tt2
            previous = tt2
        case Status.TITLE1:
            try:
                del txt
            except NameError:
                pass
            tt1 = dict()
            data[value] = tt1
            previous = tt1