In [66]:
import os
import pandas as pd
import re
import pymysql
import cn2an
import json
from IPython.display import display
from datetime import datetime
from pathlib import Path

In [None]:
# 5.5
# Extract the volume (卷) and page-number information found in the <中缝> blocks
# of every TXT file, collect one row per file in a DataFrame, and report
# missing fields and gaps in the page numbering.

# Folder to inspect
folder_path = Path(r"C:\Users\cuiwe\Desktop\212")

# Regex for a <中缝> ... <中缝/> block; surrounding whitespace is excluded from the capture
zhongfeng_pattern = re.compile(r'<中缝>\s*(.*?)\s*<中缝/>', re.DOTALL)

# Fixed column order, so `df` keeps its schema even when no TXT file is found
# (otherwise the column look-ups below raise KeyError on an empty frame).
record_columns = ["文件名", "卷", "页码", "是否缺少中缝的卷信息", "是否缺少页码信息"]

records = []

# Walk through all TXT files in name order
for file in sorted(folder_path.glob('*.txt')):
    content = file.read_text(encoding='utf-8')
    matches = zhongfeng_pattern.findall(content)

    # Start pessimistic: both fields are flagged as missing until proven otherwise
    row = {
        "文件名": file.name,
        "卷": None,
        "页码": None,
        "是否缺少中缝的卷信息": True,
        "是否缺少页码信息": True
    }

    # Volume information: the first <中缝> block
    if len(matches) >= 1:
        row["卷"] = matches[0]
        row["是否缺少中缝的卷信息"] = False

    # Page information: the second <中缝> block, written as a Chinese numeral
    if len(matches) >= 2 and matches[1].strip():
        raw_page = matches[1].strip()
        try:
            page_num = cn2an.cn2an(raw_page, mode='strict')
            row["页码"] = page_num
            row["是否缺少页码信息"] = False
        except Exception as exc:
            # Best effort: the row stays flagged as missing, but the reason is
            # reported instead of being silently discarded.
            print(f"{file.name}: 页码转换失败 {raw_page!r} → {exc}")

    records.append(row)

# Build the DataFrame
df = pd.DataFrame(records, columns=record_columns)

# Page continuity check (unique, valid page numbers only), expecting 1, 2, 3, ...
valid_pages = sorted(df["页码"].dropna().unique())
expected = 1
for num in valid_pages:
    if num != expected:
        print(f"不连续: 页码 {expected} → {num}")
        # Resynchronise so that a single gap is reported only once
        expected = num
    expected += 1

# Show the DataFrame with missing values highlighted in red
styled_df = df.style.highlight_null(props='background-color: red')
display(styled_df)

# Show the distinct volume strings as a table (useful for spotting OCR variants)
unique_juans_df = pd.DataFrame(df["卷"].dropna().unique(), columns=["所出现的卷"])
display(unique_juans_df)

Unnamed: 0,文件名,卷,页码,是否缺少中缝的卷信息,是否缺少页码信息
0,000001.txt,永樂大典卷二千二百七十二,1.0,False,False
1,000002.txt,永樂大典卷二千二百七十二,2.0,False,False
2,000003.txt,永樂大典卷三千二百七十二,3.0,False,False
3,000004.txt,永樂大典卷二千二百七十二,4.0,False,False
4,000005.txt,永樂大典卷二千二百七十二,5.0,False,False
5,000006.txt,永樂大典卷二千二百七十二,6.0,False,False
6,000007.txt,永樂大典卷二千二百七十二,7.0,False,False
7,000008.txt,永樂大典卷二千二百七十二,8.0,False,False
8,000009.txt,永樂大典卷二千二百七十二,9.0,False,False
9,000010.txt,永樂大典卷二千二百七十二,10.0,False,False


Unnamed: 0,所出现的卷
0,永樂大典卷二千二百七十二
1,永樂大典卷三千二百七十二
2,永樂大典卷二千二百七十三
3,永樂大典卷二千二百七十四
4,永樂太典卷二千二百七十四


In [None]:
# Folder that holds the numbered page files
folder_path = r'C:\Users\cuiwe\Desktop\212'

# Name of the merged output file (user-adjustable)
output_file_name = '从28到47.txt'
output_file_path = os.path.join(folder_path, output_file_name)

# First and last file numbers to merge (inclusive); files are named with six digits
start_num = 28
end_num = 47

# Patterns for the page-number tags that are stripped from every line
page_open_tag = re.compile(r"<页码.*?>")
page_close_tag = re.compile(r"</页码>")

# Merged content, one entry per line
combined_lines = []

for num in range(start_num, end_num + 1):
    file_name = f"{num:06}.txt"
    file_path = os.path.join(folder_path, file_name)

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Remove the tags first, then drop lines that are blank after removal
    untagged = (page_close_tag.sub("", page_open_tag.sub("", raw)) for raw in lines)
    cleaned_lines = [text for text in untagged if text.strip()]

    # Append this file's lines followed by one separator line
    combined_lines += cleaned_lines
    combined_lines.append("\n")

    print(f"{file_name} 合并完成，共添加 {len(cleaned_lines)} 行")

# Save the merged text
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    out_file.write(''.join(combined_lines))

print(f"合并文件已保存至: {output_file_path}")


000028.txt 合并完成，共添加 44 行
000029.txt 合并完成，共添加 42 行
000030.txt 合并完成，共添加 42 行
000031.txt 合并完成，共添加 42 行
000032.txt 合并完成，共添加 42 行
000033.txt 合并完成，共添加 42 行
000034.txt 合并完成，共添加 42 行
000035.txt 合并完成，共添加 42 行
000036.txt 合并完成，共添加 42 行
000037.txt 合并完成，共添加 42 行
000038.txt 合并完成，共添加 42 行
000039.txt 合并完成，共添加 42 行
000040.txt 合并完成，共添加 42 行
000041.txt 合并完成，共添加 42 行
000042.txt 合并完成，共添加 42 行
000043.txt 合并完成，共添加 42 行
000044.txt 合并完成，共添加 42 行
000045.txt 合并完成，共添加 42 行
000046.txt 合并完成，共添加 42 行
000047.txt 合并完成，共添加 41 行
合并文件已保存至: C:\Users\cuiwe\Desktop\212\从28到47.txt


In [None]:
# ====================== User settings ======================

# Folder that contains the input file and receives the output file
input_folder_path = Path(r'C:\Users\cuiwe\Desktop\212')

# Name of the merged input file (e.g. '从1到14.txt')
input_file_name = '从1到14.txt'

# Name of the converted output file (e.g. '从1到14_converted.txt')
output_file_name = '从1到14_加工完.txt'

# ===========================================================

# Full paths derived from the settings above
input_file_path = input_folder_path.joinpath(input_file_name)
output_file_path = input_folder_path.joinpath(output_file_name)

# Helper: add E / S markers around `$` characters according to their order
def insert_E_S_around_dollars(text):
    """Return `text` with E/S markers attached to `$` characters in a 4-step cycle.

    A counter that starts at 2 is advanced on every `$` and wraps from 4 back
    to 1. A `$` seen at position 1 becomes 'E$', one seen at position 4 becomes
    '$S', and the others are kept as '$'. All other characters are copied
    unchanged.
    """
    marked = {1: 'E$', 4: '$S'}
    position = 2  # cycle position, always in 1..4
    pieces = []

    for ch in text:
        if ch != '$':
            pieces.append(ch)
            continue
        # Advance the cycle: 1 → 2 → 3 → 4 → 1
        position = position % 4 + 1
        pieces.append(marked.get(position, '$'))

    return ''.join(pieces)

try:
    content = input_file_path.read_text(encoding='utf-8')

    # Literal replacements, applied strictly in this order:
    #   1. full-width space → '+'
    #   2. wrap the <正文> / <正文/> tags in '$'
    #   3. add E before / S after the 引书 and 注释 tags
    #   4. '+' → 'ES'
    #   5. delete the <页码> tags
    literal_replacements = (
        ('　', '+'),
        ('<正文>', '$<正文>$'),
        ('<正文/>', '$<正文/>$'),
        ('<引书>', 'E<引书>'),
        ('<注释>', 'E<注释>'),
        ('</引书>', '</引书>S'),
        ('</注释>', '</注释>S'),
        ('+', 'ES'),
        ('<页码>', ''),
        ('</页码>', ''),
    )
    converted_content = content
    for old_text, new_text in literal_replacements:
        converted_content = converted_content.replace(old_text, new_text)

    # Attach E / S to the '$' characters according to their order
    converted_content = insert_E_S_around_dollars(converted_content)

    # Insert a "p<N>" line wherever one 正文 block closes and the next one opens;
    # the counter starts at 1, so the first inserted marker is p2.
    page_number = 1

    def insert_page_number(match):
        global page_number
        page_number += 1
        return f"{match.group(1)}\np{page_number}\n{match.group(2)}"

    pattern = re.compile(r'(\$<正文/>\$)\s*\uFEFF*(\$<正文>\$)')
    converted_content = pattern.sub(insert_page_number, converted_content)

    output_file_path.write_text(converted_content, encoding='utf-8')

    print(f"结果文件保存完成: {output_file_path}")

except Exception as e:
    print(f"处理过程中发生错误: {e}")

# ====================== Verification ======================

try:
    final_content = output_file_path.read_text(encoding='utf-8')

    # Page numbers taken from the "p<N>" markers; 1 is prepended when no marker carries it
    p_numbers = [int(digits) for digits in re.findall(r'p(\d+)', final_content)]
    if 1 not in p_numbers:
        p_numbers.insert(0, 1)
    print(f'页码编号列表 ({len(p_numbers)} 个): {p_numbers}')

    zhongfeng_blocks = re.findall(r'<中缝>(.*?)<中缝/>', final_content, re.DOTALL)

    # The blocks are read in pairs; the second block of each pair is converted to a number
    zhongfeng_numbers = []
    for block in zhongfeng_blocks[1::2]:
        raw_number = block.strip().replace('\n', '')
        try:
            zhongfeng_numbers.append(cn2an.cn2an(raw_number, mode="smart"))
        except Exception as e:
            print(f'中缝数字转换失败: {raw_number} → {e}')
            zhongfeng_numbers.append(None)

    print(f'中缝页码列表 ({len(zhongfeng_numbers)} 个): {zhongfeng_numbers}')

    # Compare the inserted page markers with the numbers read from the 中缝 blocks
    for idx, pair in enumerate(zip(p_numbers, zhongfeng_numbers), start=1):
        p_num, zf_num = pair
        if zf_num is None:
            print(f'p{p_num}: 中缝编号转换失败')
            continue
        if p_num != zf_num:
            print(f'页码不一致: p{p_num} != 中缝 {zf_num} (索引 {idx})')

    # Extra check: report page markers that have a 引书/注释 tag two lines above
    # AND two lines below them.
    tag_markers = ('<引书>', '<注释>', '</引书>', '</注释>')
    lines = final_content.splitlines()
    line_count = len(lines)
    for idx, line in enumerate(lines):
        page_match = re.match(r'p(\d+)', line.strip())
        if not page_match:
            continue

        page_no = int(page_match.group(1))
        prev_line = lines[idx - 1] if idx > 0 else ''
        next_line = lines[idx + 1] if idx + 1 < line_count else ''
        if '$<正文/>$' not in prev_line or '$<正文>$' not in next_line:
            continue

        upper_tag = idx - 2 >= 0 and any(tag in lines[idx - 2] for tag in tag_markers)
        lower_tag = idx + 2 < line_count and any(tag in lines[idx + 2] for tag in tag_markers)
        if upper_tag and lower_tag:
            print(f'p{page_no} 附近同时出现了 引书/注释 标签')

    print('检验完成')

except Exception as e:
    print(f'检验过程中发生错误: {e}')

结果文件保存完成: C:\Users\cuiwe\Desktop\212\从1到14_加工完.txt
页码编号列表 (14 个): [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
中缝页码列表 (14 个): [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
检验完成


In [None]:
# ———————————————————————————— User settings ————————————————————————————
file_path = r'C:\Users\cuiwe\Desktop\212\从1到14_加工完.txt'
output_path = r'C:\Users\cuiwe\Desktop\212\从1到14识别结果.txt'
# Values copied into every record produced below
title_level = 'h4'
page_number = 1
title_id = 40
doc_id = 1
# ———————————————————————————— User settings ————————————————————————————


# Parser state. The input is scanned one character at a time and turned into
# a list of record dicts (`result`) with text_type 引书 / 注疏 / 引文.
result = []
in_paragraph = False      # True between an 'S' marker and the following 'E' marker
append_enabled = False    # set and cleared together with in_paragraph
flag_append = True        # cleared when the `$` counter is 1, set again when it reaches 4
flag_append_counter = 2   # counts `$` characters; reset to 0 each time it reaches 4
flag_A_B = 'B'            # page-type marker (starts as 'B', toggled on every 4th count)
current_text = ""         # characters collected for the paragraph being read
flag_kuaye = False        # cross-page marker (starts as False)

# 'utf-8-sig' drops a leading byte-order mark if the file has one
with open(file_path, 'r', encoding='utf-8-sig') as file:
    data = file.read()

    i = 0
    while i < len(data):
        char = data[i]

        # Page marker: 'p' followed by digits (looked up within the next 10
        # characters). It updates page_number and is skipped entirely.
        if char == 'p':
            page_match = re.match(r'p(\d+)', data[i:i+10])
            if page_match:
                page_number = int(page_match.group(1))
                i += len(page_match.group(0))
                continue

        # `$` characters drive flag_append and the page type; the `$` itself
        # is never added to the collected text.
        if char == '$':
            flag_append_counter += 1

            if flag_append_counter == 1:
                flag_append = False

            elif flag_append_counter == 4:
                flag_append = True
                flag_append_counter = 0

                # Switch between page types A and B
                if flag_A_B == 'B':
                    flag_A_B = 'A'
                else:
                    flag_A_B = 'B'

            i += 1
            continue

        # <引书> ... </引书>: stored as one "引书" record, then the scan jumps
        # past the closing tag. If no closing tag exists, the character falls
        # through to the checks below.
        if data[i:i+4] == '<引书>':
            end_tag = data.find('</引书>', i)
            if end_tag != -1:
                quote_content = data[i+4:end_tag]
                result.append({
                    "text_type": "引书",
                    "full_text": quote_content.strip().replace('\n', '').replace('　', ''),
                    "full_text_order": len(result) + 1,
                    "title_level": title_level,
                    "title_id": title_id,
                    "related_id": None,
                    "doc_id": doc_id,
                    "page_number": page_number,
                    "page_type": flag_A_B
                })
                i = end_tag + len('</引书>')
                continue

        # <注释> ... </注释>: handled like <引书>, but stored with text_type "注疏"
        if data[i:i+4] == '<注释>':
            end_tag = data.find('</注释>', i)
            if end_tag != -1:
                comment_content = data[i+4:end_tag]
                result.append({
                    "text_type": "注疏",
                    "full_text": comment_content.strip().replace('\n', '').replace('　', ''),
                    "full_text_order": len(result) + 1,
                    "title_level": title_level,
                    "title_id": title_id,
                    "related_id": None,
                    "doc_id": doc_id,
                    "page_number": page_number,
                    "page_type": flag_A_B
                })
                i = end_tag + len('</注释>')
                continue

        # Paragraph start: an 'S' seen while no paragraph is open
        if char == 'S' and not in_paragraph:
            flag_kuaye = False
            in_paragraph = True
            append_enabled = True
            current_text = ""
            i += 1
            continue

        # Paragraph end: an 'E' seen while a paragraph is open
        if char == 'E' and in_paragraph:
            # Cross-page detection: the 'E' is directly followed by '$<正文/>$'
            # and the character two positions before it is not '。'.
            if data[i:i+8].find('$<正文/>$') != -1 and data[i - 2] != '。':
                flag_kuaye = True

            in_paragraph = False
            append_enabled = False
            clean_text = current_text.strip().replace('\n', '').replace('　', '')
            # Paragraphs that are empty after cleaning produce no record
            if clean_text:
                result.append({
                    "text_type": "引文",
                    "full_text": clean_text,
                    "full_text_order": len(result) + 1,
                    "title_level": title_level,
                    "title_id": title_id,
                    "related_id": None,
                    "doc_id": doc_id,
                    "page_number": page_number,
                    "page_type": flag_A_B,
                    "flag_kuaye": flag_kuaye
                })
            current_text = ""
            i += 1
            continue

        # Collect the character only when all three switches allow it
        if in_paragraph and append_enabled and flag_append:
            current_text += char

        i += 1

# Finally, repair sentences that were cut in two by a page break
cleaned_result = []

for position, current_item in enumerate(result):
    # Records without any text are left out of the output
    if not current_item.get("full_text", "").strip():
        continue

    has_successor = position + 1 < len(result)
    if current_item.get("flag_kuaye") and has_successor:
        next_item = result[position + 1]
        next_text = next_item.get("full_text", "")
        # Move everything up to and including the first '。' of the following
        # record onto the end of the current one.
        head, full_stop, remainder = next_text.partition("。")
        if full_stop:
            current_item["full_text"] += head + full_stop
            next_item["full_text"] = remainder

    cleaned_result.append(current_item)

# Renumber the kept records consecutively, starting at 1
for offset, item in enumerate(cleaned_result):
    item["full_text_order"] = offset + 1

# Write the records to the output file as JSON
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(cleaned_result, f, ensure_ascii=False, indent=4)


In [None]:
# Cited-book (引书) check. It rarely reveals an error; it is mainly a way to
# look through the list of cited books.
file_path = r'C:\Users\cuiwe\Desktop\212\从1到14_加工完.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    data = file.read()

# Titles as they appear between the tags of the converted text
pattern = re.compile(r'<引书>(.*?)</引书>', re.DOTALL)
matches = pattern.findall(data)
yinshu_from_raw = [
    raw_title.strip().replace('\n', '').replace('　', '')
    for raw_title in matches
]

# Titles as recognised by the parser (`result` comes from the parsing cell)
yinshu_from_result = [
    item["full_text"].strip().replace('\n', '').replace('　', '')
    for item in result
    if item.get("text_type") == "引书"
]

# Pad the shorter list with None so both columns have the same length
max_len = max(len(yinshu_from_raw), len(yinshu_from_result))
yinshu_from_raw.extend([None] * (max_len - len(yinshu_from_raw)))
yinshu_from_result.extend([None] * (max_len - len(yinshu_from_result)))

df = pd.DataFrame({
    "识别前": yinshu_from_raw,
    "识别后": yinshu_from_result
})

# Lift the pandas display limits so the whole table is shown
for option_name in ('display.max_rows', 'display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(option_name, None)

display(df)

print("原始文本中 引书 总数:", sum(title is not None for title in yinshu_from_raw))
print("代码识别结果 中 引书 总数:", sum(title is not None for title in yinshu_from_result))


Unnamed: 0,识别前,识别后
0,宋黄氏日抄,宋黄氏日抄
1,家則堂先生瀛州集,家則堂先生瀛州集
2,元楊志行霜月齋集,元楊志行霜月齋集
3,唐白居易詩,唐白居易詩
4,薛榮詩,薛榮詩
5,方玄英詩,方玄英詩
6,劉文房詩,劉文房詩
7,陸龜蒙詩,陸龜蒙詩
8,常建詩,常建詩
9,宋宼凖巴東集,宋宼凖巴東集


原始文本中 引书 总数: 70
代码识别结果 中 引书 总数: 70


In [None]:
def insert_documents_from_result(result_file_path, connection):
    """Register every cited book (text_type == "引书") of a result file in `Documents`.

    For each book title the table is searched with a full-text phrase query
    (`MATCH ... AGAINST` in boolean mode, title wrapped in double quotes):

    * match found  -> the item's doc_id is added to the row's comma-separated
      `doc_origin_id` unless it is already listed there;
    * no match     -> a new row is inserted with the title wrapped in 《》.

    All changes are committed together at the end. If anything fails, the
    transaction is rolled back and the exception is re-raised.

    The connection is NOT closed here. It belongs to the caller, which passes
    the same connection to the following insert steps and closes it itself.

    Args:
        result_file_path: path of the JSON file holding the list of records.
        connection: open DB connection whose cursors return rows as dicts
            (rows are accessed by column name).
    """
    with open(result_file_path, 'r', encoding='utf-8') as f:
        result = json.load(f)

    try:
        with connection.cursor() as cursor:
            for item in result:
                if item.get("text_type") != "引书":
                    continue

                # Normalise the title; a missing or blank title cannot be searched for
                book_title = (item.get('full_text') or '').strip()
                if not book_title:
                    continue
                book_title_wrapped = f"《{book_title}》"
                doc_id_to_add = str(item.get("doc_id"))

                # Look for an existing document (doc_origin_id is needed below)
                cursor.execute(
                    """
                    SELECT doc_id, doc_origin_id FROM Documents
                    WHERE MATCH(doc_title) AGAINST (%s IN BOOLEAN MODE)
                    """,
                    (f'"{book_title}"',)
                )
                existing = cursor.fetchone()

                if not existing:
                    # Unknown book: insert a new document row
                    cursor.execute(
                        """
                        INSERT INTO Documents (doc_title, doc_type, doc_origin_id)
                        VALUES (%s, %s, %s)
                        """,
                        (book_title_wrapped, False, doc_id_to_add)
                    )
                    print(f"{book_title_wrapped} ➤ 新文献插入完成（origin_id={doc_id_to_add}）")
                    continue

                this_doc_id = existing["doc_id"]
                this_origin = existing.get("doc_origin_id")

                if this_origin and this_origin.strip():
                    # doc_origin_id is a comma-separated list; add the id only once
                    origin_list = [x.strip() for x in this_origin.split(',') if x.strip()]
                    if doc_id_to_add in origin_list:
                        print(f"{book_title_wrapped} ➤ 已存在，origin_id 中已包含 {doc_id_to_add}")
                    else:
                        origin_list.append(doc_id_to_add)
                        new_origin = ','.join(origin_list)
                        cursor.execute(
                            """
                            UPDATE Documents
                            SET doc_origin_id = %s
                            WHERE doc_id = %s
                            """,
                            (new_origin, this_doc_id)
                        )
                        print(f"{book_title_wrapped} ➤ 已存在，已将 {doc_id_to_add} 添加到 origin_id")
                else:
                    # doc_origin_id is NULL or blank: set it from scratch
                    cursor.execute(
                        """
                        UPDATE Documents
                        SET doc_origin_id = %s
                        WHERE doc_id = %s
                        """,
                        (doc_id_to_add, this_doc_id)
                    )
                    print(f"{book_title_wrapped} ➤ 已存在，origin_id 设置为 {doc_id_to_add}")

        connection.commit()

    except Exception:
        # Do not leave a half-applied batch behind
        connection.rollback()
        raise


In [None]:
def insert_full_text_from_result(file_path, conn):
    """Bulk-insert every record of a result file into the `full_text_1` table.

    The JSON file at `file_path` holds a list of dicts. Each dict becomes one
    row, and keys that are absent are inserted as NULL. On success the batch is
    committed and the cursor's row count is printed. On any error the batch is
    rolled back and the error is printed, not raised. The cursor is closed in
    both cases; the connection is left open.
    """
    cursor = conn.cursor()

    # Load the records
    with open(file_path, 'r', encoding='utf-8') as f:
        result = json.load(f)

    # Column values are supplied in exactly the order of `field_names` below
    insert_query = '''
    INSERT INTO full_text_1
    (full_text, full_text_order, title_level, title_id, text_type, related_id, quote_loc, doc_id, page_number, page_type)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    '''

    field_names = (
        'full_text', 'full_text_order', 'title_level', 'title_id', 'text_type',
        'related_id', 'quote_loc', 'doc_id', 'page_number', 'page_type',
    )
    data_to_insert = [
        tuple(entry.get(name) for name in field_names)
        for entry in result
    ]

    # Run the batch as a single transaction
    try:
        cursor.executemany(insert_query, data_to_insert)
        conn.commit()
        print(f"{cursor.rowcount} 条数据已成功插入 full_text_1 表。")
    except Exception as e:
        conn.rollback()
        print(f"发生错误: {e}")
    finally:
        cursor.close()

In [None]:
def update_full_text_relationships_from_result(file_path, conn):
    """Fill in `related_id` for the Full_Text_1 rows of one title.

    The title_id is read from the first record of the JSON result file. The
    rows of that title are then walked in ascending `full_text_id` order:

    * 引书 (cited book): remembered as the current book; its row is not updated;
    * 引文 (quotation): linked to the most recent 引书;
    * 注疏 (commentary): linked to the most recent 引文.

    Rows that have nothing to link to are reported and skipped. All updates
    are committed together. On failure the transaction is rolled back and the
    exception is re-raised.

    The connection is NOT closed here. It belongs to the caller, which still
    needs it for the following step.

    Args:
        file_path: path of the JSON result file (a list of record dicts).
        conn: open pymysql connection.
    """
    # Read the title_id from the result file
    with open(file_path, 'r', encoding='utf-8') as f:
        result = json.load(f)
    if not result:
        # Nothing was recognised, so there is no title to work on
        print("结果文件为空，无可更新的关联。")
        return
    title_id = result[0].get("title_id")

    try:
        with conn.cursor(pymysql.cursors.DictCursor) as cursor:
            # 1. Fetch all rows of this title. The linking below depends on
            #    row order, so it is requested explicitly.
            #    NOTE(review): assumes full_text_id reflects insertion order — confirm against the schema.
            cursor.execute("""
                SELECT full_text_id, text_type
                FROM Full_Text_1
                WHERE title_id = %s
                ORDER BY full_text_id ASC
            """, (title_id,))
            rows = cursor.fetchall()
            print(f"共获取数据 {len(rows)} 条\n")

            # 2. IDs of the most recently seen 引书 and 引文
            last_yinshu_id = None
            last_yinwen_id = None
            update_count = 0

            # 3. Walk the rows and set related_id
            for row in rows:
                full_text_id = row['full_text_id']
                text_type = row['text_type']
                related_id = None

                if text_type == '引书':
                    last_yinshu_id = full_text_id
                    print(f"发现引书 → ID {full_text_id}（related_id 不变）")
                    continue

                elif text_type == '引文':
                    if last_yinshu_id is not None:
                        related_id = last_yinshu_id
                        last_yinwen_id = full_text_id
                        print(f"引文 → 关联到引书 ID {related_id}")
                    else:
                        print(f"警告：没有引书情况下出现引文！ID {full_text_id}")
                        continue

                elif text_type == '注疏':
                    if last_yinwen_id is not None:
                        related_id = last_yinwen_id
                        print(f"注疏 → 关联到引文 ID {related_id}")
                    else:
                        print(f"警告：没有引文情况下出现注疏！ID {full_text_id}")
                        continue

                # Rows of any other text_type keep related_id None and are not updated
                if related_id is not None:
                    cursor.execute("""
                        UPDATE Full_Text_1
                        SET related_id = %s
                        WHERE full_text_id = %s
                    """, (related_id, full_text_id))
                    update_count += 1

            conn.commit()
            print(f"\n共更新 {update_count} 条 related_id 字段。")

    except Exception:
        # Do not leave a half-linked title behind
        conn.rollback()
        raise

    print("所有关联设置已完成。")

In [None]:
def insert_pages_from_result(file_path, conn):
    """Create one `pages` row per (doc_id, title_id, page_number, page_type) group.

    The title_id is read from the first record of the JSON result file. The
    full_text_1 rows of that title are grouped per page, with their
    full_text_id values concatenated in ascending order, and every group is
    inserted into `pages` with page_image NULL and the current time as
    create_time. The batch is committed on success. On error it is rolled back
    and the error is printed, not raised. The cursor is closed in both cases.
    """
    cursor = conn.cursor(pymysql.cursors.DictCursor)

    # The first record of the result file supplies the title_id
    with open(file_path, 'r', encoding='utf-8') as f:
        result = json.load(f)
    title_id = result[0].get("title_id")

    # One result row per page, carrying the list of its full_text_id values
    select_query = '''
    SELECT 
        doc_id,
        title_id,
        page_number,
        page_type,
        GROUP_CONCAT(full_text_id ORDER BY full_text_id ASC) AS full_text_id_list
    FROM 
        full_text_1
    WHERE
        title_id = %s
    GROUP BY
        doc_id, title_id, page_number, page_type
    ORDER BY
        doc_id ASC, title_id ASC, page_number ASC, page_type ASC;
    '''

    cursor.execute(select_query, (title_id,))
    rows = cursor.fetchall()

    # Turn every page group into a parameter tuple for the INSERT below
    pages_data = []
    for row in rows:
        page_doc_id = row['doc_id']
        id_list = row['full_text_id_list']
        page_no = row['page_number']
        page_kind = row['page_type']
        page_title_id = row['title_id']

        pages_data.append(
            (page_doc_id, id_list, page_no, page_kind, None, datetime.now(), page_title_id)
        )  # the None is page_image (not available yet); datetime.now() is create_time

        print(f"doc_id {page_doc_id}, page_number {page_no}, page_type {page_kind}, title_id {page_title_id}, 包含 full_text_id: {id_list}")

    insert_query = '''
    INSERT INTO pages
    (doc_id, full_text_id_list, page_number, page_type, page_image, create_time, title_id)
    VALUES (%s, %s, %s, %s, %s, %s, %s)
    '''

    try:
        cursor.executemany(insert_query, pages_data)
        conn.commit()
        print(f"{cursor.rowcount} 条数据已插入 pages 表。")
    except Exception as e:
        conn.rollback()
        print(f"发生错误: {e}")
    finally:
        cursor.close()


In [None]:
def run_all_insert_steps(file_path, conn):
    """Run the four database steps in order on the same result file and connection.

    Order: documents, full text, relationships, pages. Returns None.
    """
    pipeline = (
        insert_documents_from_result,
        insert_full_text_from_result,
        update_full_text_relationships_from_result,
        insert_pages_from_result,
    )
    for step in pipeline:
        step(file_path, conn)

In [None]:
# JSON result file to import into the database
file_path = r'C:\Users\cuiwe\Desktop\212\从1到14识别结果.txt'

# The database password must not be stored in the notebook. It is read from
# the LEISHU_DB_PASSWORD environment variable, with an interactive prompt as
# fallback. NOTE(review): the password that used to be hard-coded here should
# be rotated, since it has been saved with the notebook.
from getpass import getpass

db_password = os.environ.get('LEISHU_DB_PASSWORD') or getpass('MySQL password: ')

conn = pymysql.connect(
    host='localhost',
    user='root',
    password=db_password,
    database='leishu_yongle',
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor  # rows come back as dicts
)

try:
    run_all_insert_steps(file_path, conn)
finally:
    # A step may already have closed the connection, and closing a closed
    # pymysql connection raises, so check first. The `finally` ensures the
    # connection is released even when a step fails.
    if conn.open:
        conn.close()

《宋黄氏日抄》 ➤ 이미 존재, origin_id에 1 포함됨
《家則堂先生瀛州集》 ➤ 이미 존재, origin_id에 1 포함됨
《元楊志行霜月齋集》 ➤ 이미 존재, origin_id에 1 포함됨
《唐白居易詩》 ➤ 이미 존재, origin_id에 1 포함됨
《薛榮詩》 ➤ 이미 존재, origin_id에 1 포함됨
《方玄英詩》 ➤ 이미 존재, origin_id에 1 포함됨
《劉文房詩》 ➤ 이미 존재, origin_id에 1 포함됨
《陸龜蒙詩》 ➤ 이미 존재, origin_id에 1 포함됨
《常建詩》 ➤ 이미 존재, origin_id에 1 포함됨
《宋宼凖巴東集》 ➤ 이미 존재, origin_id에 1 포함됨
《文潞公集》 ➤ 이미 존재, origin_id에 1 포함됨
《晏元獻公集》 ➤ 이미 존재, origin_id에 1 포함됨
《景文公集》 ➤ 이미 존재, origin_id에 1 포함됨
《程明道集》 ➤ 이미 존재, origin_id에 1 포함됨
《蔡端明集》 ➤ 이미 존재, origin_id에 1 포함됨
《劉忠肅公》 ➤ 이미 존재, origin_id에 1 포함됨
《歐陽公》 ➤ 이미 존재, origin_id에 1 포함됨
《蘇東坡詩》 ➤ 이미 존재, origin_id에 1 포함됨
《蘇頴濵集》 ➤ 이미 존재, origin_id에 1 포함됨
《陳后山詩》 ➤ 이미 존재, origin_id에 1 포함됨
《魏鶴山大全集》 ➤ 이미 존재, origin_id에 1 포함됨
《強幾聖詩》 ➤ 이미 존재, origin_id에 1 포함됨
《張嵲詩》 ➤ 이미 존재, origin_id에 1 포함됨
《韓維南陽集》 ➤ 이미 존재, origin_id에 1 포함됨
《郭印雲溪集》 ➤ 이미 존재, origin_id에 1 포함됨
《崔敦禮舎人集》 ➤ 이미 존재, origin_id에 1 포함됨
《鄭獬鄖溪集》 ➤ 이미 존재, origin_id에 1 포함됨
《李姑叔姑溪集》 ➤ 이미 존재, origin_id에 1 포함됨
《李壁雁湖集》 ➤ 이미 존재, origin_id에 1 포함됨
《武朝宗適安蔵拙餘藁》 ➤ 이미 존재, ori