#### **1. 导入模块**

导入 Python 标准库和本项目自定义库

In [1]:
# 标准库
import os
import sys
import time

# Add the parent directory to the module search path so that the
# project-local packages imported in this cell can be found.
# The membership check keeps the cell idempotent: re-running it does not
# pile up duplicate entries in sys.path.
_parent_dir = os.path.abspath('..')
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

# 自定义库
from src.utils import *
from src.annotator.pos_tagger import *

#### **2. 读取语料**

读取 TSV 格式的平行语料库

In [2]:
# Directory that holds the corpus files
data_dir = "../data/raw"

# Load the parallel corpus; `limit` controls how many sentence pairs are read
data = load_data(data_dir, limit=10)
print(f"成功读取 {len(data)} 条平行句对")

# Preview the data.
# First column: Chinese source text, from "The Deer and the Cauldron" (鹿鼎记).
# Second column: English translation, from "The Deer and The Cauldron"
# (translated by John Minford).
print('数据前 5 行如下：')
data.head()

成功读取 10 条平行句对
数据前 5 行如下：


Unnamed: 0,source,target
1,江南近海滨的一条大路上，一队清兵手执刀枪，押着七辆囚车，冲风冒寒，向北而行。,Along a coastal road somewhere south of the Ya...
2,前面三辆囚车中分别监禁的是三个男子，都作书生打扮，一个是白发老者，两个是中年人。,In each of the first three carts a single male...
3,后面四辆囚车中坐的是女子，最后一辆囚车中是个少妇，怀中抱着个女婴。,"The four rear carts were occupied by women, th..."
4,女婴啼哭不休。 她母亲温言相呵，女婴只是大哭。,The little girl was crying in a continuous wai...
5,囚车旁一清兵恼了，伸腿在车上踢了一脚，喝道：“再哭，再哭！,"One of the soldiers marching alongside, irrita..."


#### **3. 大模型英文词性标注：UD 赋码集**

In [3]:
# === Load model: LLM tagger, UD tagset ===

# Language: English
lang = 'english'

# Tagset: UD
# https://universaldependencies.org/u/pos/index.html
tagset = 'ud'

# Annotation mode: LLM API
mode = 'llm'

# LLM to use: deepseek-v3.2 | glm-4.7 | qwen3-max
llm_model = 'deepseek-v3.2'

# Prerequisite: sign in to the Alibaba Cloud Bailian platform
# (https://bailian.console.aliyun.com/), apply for an API account for the
# LLM service, and set LLM_API_KEY=sk-******** in the
# llm_corpus_annotation/config file.

# Measure the load time with time.perf_counter(): it is a monotonic,
# high-resolution clock meant for timing intervals, whereas time.time()
# follows the wall clock and can jump if the system time is adjusted.
t0 = time.perf_counter()
llm_en_ud_tagger = POSTagger(
    lang=lang,
    tagset=tagset,
    mode=mode,
    llm_model=llm_model,
)
t1 = time.perf_counter()
print('LLM UD 英文词性标注模型加载完毕！')
print(f'加载耗时：{t1-t0:.2f}s')

LLM UD 英文词性标注模型加载完毕！
加载耗时：2.17s


In [4]:
# === Annotate the English translations: LLM tagger, UD tagset ===

# Note on cost and timing:
# to save API cost, the LLM output is stored in the local cache
# data/llm_cache. After the first call, later calls only read the generated
# results back from that local store, so on a re-run the reported time is
# 0.0 seconds.
llm_en_ud_annos = annotate_data(data['target'], llm_en_ud_tagger)

# Print a preview: the first annotated item
print('\n=== LLM UD 英文词性标注结果预览 ===')
display_anno(llm_en_ud_annos[0])

POS Tagging: 100%|████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 737.63it/s]


=== LLM UD 英文词性标注结果预览 ===

[ID]: 00001
Along a coastal road somewhere south of the Yangtze River, a detachment of soldiers, each of them armed with a halberd, was escorting a line of seven prison carts, trudging northwards in the teeth of a bitter wind.
--------------------------------------------------------------------------------
[('Along', 'ADP'), ('a', 'DET'), ('coastal', 'ADJ'), ('road', 'NOUN'), ('somewhere', 'ADV'), ('south', 'ADV'), ('of', 'ADP'), ('the', 'DET'), ('Yangtze', 'PROPN'), ('River', 'PROPN'), (',', 'PUNCT'), ('a', 'DET'), ('detachment', 'NOUN'), ('of', 'ADP'), ('soldiers', 'NOUN'), (',', 'PUNCT'), ('each', 'PRON'), ('of', 'ADP'), ('them', 'PRON'), ('armed', 'VERB'), ('with', 'ADP'), ('a', 'DET'), ('halberd', 'NOUN'), (',', 'PUNCT'), ('was', 'AUX'), ('escorting', 'VERB'), ('a', 'DET'), ('line', 'NOUN'), ('of', 'ADP'), ('seven', 'NUM'), ('prison', 'NOUN'), ('carts', 'NOUN'), (',', 'PUNCT'), ('trudging', 'VERB'), ('northwards', 'ADV'), ('in', 'ADP'), ('the', 'DET'), 




#### **4. 大模型英文词性标注：PTB 赋码集**

In [5]:
# === Load model: LLM tagger, PTB tagset ===

# Language: English
lang = 'english'

# Tagset: PTB (Penn Treebank)
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
tagset = 'ptb'

# Annotation mode: LLM API
mode = 'llm'

# LLM to use: deepseek-v3.2 | glm-4.7 | qwen3-max
llm_model = 'deepseek-v3.2'

# Prerequisite: sign in to the Alibaba Cloud Bailian platform
# (https://bailian.console.aliyun.com/), apply for an API account for the
# LLM service, and set LLM_API_KEY=sk-******** in the
# llm_corpus_annotation/config file.

# Measure the load time with time.perf_counter(): it is a monotonic,
# high-resolution clock meant for timing intervals, whereas time.time()
# follows the wall clock and can jump if the system time is adjusted.
t0 = time.perf_counter()
llm_en_ptb_tagger = POSTagger(
    lang=lang,
    tagset=tagset,
    mode=mode,
    llm_model=llm_model,
)
t1 = time.perf_counter()
print('LLM PTB 英文词性标注模型加载完毕！')
print(f'加载耗时：{t1-t0:.2f}s')

LLM PTB 英文词性标注模型加载完毕！
加载耗时：1.04s


In [6]:
# === Annotate the English translations: LLM tagger, PTB tagset ===

# Note on cost and timing:
# to save API cost, the LLM output is stored in the local cache
# data/llm_cache. After the first call, later calls only read the generated
# results back from that local store, so on a re-run the reported time is
# 0.0 seconds.
llm_en_ptb_annos = annotate_data(data['target'], llm_en_ptb_tagger)

# Print a preview: the first annotated item
print('\n=== LLM PTB 英文词性标注结果预览 ===')
display_anno(llm_en_ptb_annos[0])

POS Tagging: 100%|████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 826.01it/s]


=== LLM PTB 英文词性标注结果预览 ===

[ID]: 00001
Along a coastal road somewhere south of the Yangtze River, a detachment of soldiers, each of them armed with a halberd, was escorting a line of seven prison carts, trudging northwards in the teeth of a bitter wind.
--------------------------------------------------------------------------------
[('Along', 'IN'), ('a', 'DT'), ('coastal', 'JJ'), ('road', 'NN'), ('somewhere', 'RB'), ('south', 'RB'), ('of', 'IN'), ('the', 'DT'), ('Yangtze', 'NNP'), ('River', 'NNP'), (',', ','), ('a', 'DT'), ('detachment', 'NN'), ('of', 'IN'), ('soldiers', 'NNS'), (',', ','), ('each', 'DT'), ('of', 'IN'), ('them', 'PRP'), ('armed', 'VBN'), ('with', 'IN'), ('a', 'DT'), ('halberd', 'NN'), (',', ','), ('was', 'VBD'), ('escorting', 'VBG'), ('a', 'DT'), ('line', 'NN'), ('of', 'IN'), ('seven', 'CD'), ('prison', 'NN'), ('carts', 'NNS'), (',', ','), ('trudging', 'VBG'), ('northwards', 'RB'), ('in', 'IN'), ('the', 'DT'), ('teeth', 'NN'), ('of', 'IN'), ('a', 'DT'), ('bitter', 




#### **5. 大模型英文词性标注：CLAWS7 赋码集**

In [7]:
# === Load model: LLM tagger, CLAWS7 tagset ===

# Language: English
lang = 'english'

# Tagset: CLAWS7
# https://ucrel.lancs.ac.uk/claws7tags.html
tagset = 'claws'

# Annotation mode: LLM API
mode = 'llm'

# LLM to use: deepseek-v3.2 | glm-4.7 | qwen3-max
llm_model = 'deepseek-v3.2'

# Prerequisite: sign in to the Alibaba Cloud Bailian platform
# (https://bailian.console.aliyun.com/), apply for an API account for the
# LLM service, and set LLM_API_KEY=sk-******** in the
# llm_corpus_annotation/config file.

# Measure the load time with time.perf_counter(): it is a monotonic,
# high-resolution clock meant for timing intervals, whereas time.time()
# follows the wall clock and can jump if the system time is adjusted.
t0 = time.perf_counter()
llm_en_claws_tagger = POSTagger(
    lang=lang,
    tagset=tagset,
    mode=mode,
    llm_model=llm_model,
)
t1 = time.perf_counter()
print('LLM CLAWS7 英文词性标注模型加载完毕！')
print(f'加载耗时：{t1-t0:.2f}s')

LLM CLAWS7 英文词性标注模型加载完毕！
加载耗时：1.05s


In [8]:
# === Annotate the English translations: LLM tagger, CLAWS7 tagset ===

# Note on cost and timing:
# to save API cost, the LLM output is stored in the local cache
# data/llm_cache. After the first call, later calls only read the generated
# results back from that local store, so on a re-run the reported time is
# 0.0 seconds.
llm_en_claws_annos = annotate_data(data['target'], llm_en_claws_tagger)

# Print a preview: the first annotated item
print('\n=== LLM CLAWS7 英文词性标注结果预览 ===')
display_anno(llm_en_claws_annos[0])

POS Tagging: 100%|███████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 2051.21it/s]


=== LLM CLAWS7 英文词性标注结果预览 ===

[ID]: 00001
Along a coastal road somewhere south of the Yangtze River, a detachment of soldiers, each of them armed with a halberd, was escorting a line of seven prison carts, trudging northwards in the teeth of a bitter wind.
--------------------------------------------------------------------------------
[('Along', 'II'), ('a', 'AT1'), ('coastal', 'JJ'), ('road', 'NN1'), ('somewhere', 'RL'), ('south', 'ND1'), ('of', 'IO'), ('the', 'AT'), ('Yangtze', 'NP1'), ('River', 'NNL1'), (',', ','), ('a', 'AT1'), ('detachment', 'NN1'), ('of', 'IO'), ('soldiers', 'NN2'), (',', ','), ('each', 'PN1'), ('of', 'IO'), ('them', 'PPHO2'), ('armed', 'VVN'), ('with', 'IW'), ('a', 'AT1'), ('halberd', 'NN1'), (',', ','), ('was', 'VBDZ'), ('escorting', 'VVG'), ('a', 'AT1'), ('line', 'NN1'), ('of', 'IO'), ('seven', 'MC'), ('prison', 'NN1'), ('carts', 'NN2'), (',', ','), ('trudging', 'VVG'), ('northwards', 'RL'), ('in', 'II'), ('the', 'AT'), ('teeth', 'NN2'), ('of', 'IO'), ('a',




#### **6. 英文词性标注模型对比：LLM & spaCy**

In [9]:
# === Load model: spaCy tagger, UD tagset ===

# Language: English
lang = 'english'

# Tagset: UD
# https://universaldependencies.org/u/pos/index.html
tagset = 'ud'

# Annotation mode: local spaCy model
mode = 'local'

# spaCy UD model: en_core_web_trf
# https://spacy.io/models/en#en_core_web_trf
# On first load the model has to be downloaded from the network.

# Measure the load time with time.perf_counter(): it is a monotonic,
# high-resolution clock meant for timing intervals, whereas time.time()
# follows the wall clock and can jump if the system time is adjusted.
t0 = time.perf_counter()
# NOTE(review): `spay_` looks like a typo for `spacy_`; the name is kept
# unchanged because the annotation cell that follows refers to it.
spay_en_ud_tagger = POSTagger(
    lang=lang,
    tagset=tagset,
    mode=mode,
)
t1 = time.perf_counter()
print('Spacy UD 英文词性标注模型加载完毕！')
print(f'加载耗时：{t1-t0:.2f}s')

Spacy UD 英文词性标注模型加载完毕！
加载耗时：7.53s


In [10]:
# === Annotate the English translations: spaCy tagger, UD tagset ===

# Tag the English column with the spaCy UD tagger
spacy_en_ud_annos = annotate_data(data['target'], spay_en_ud_tagger)

# Print a preview: the first annotated item
print('\n===Spacy UD 英文词性标注结果预览 ===')
display_anno(spacy_en_ud_annos[0])

POS Tagging: 100%|█████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 16.70it/s]


===Spacy UD 英文词性标注结果预览 ===

[ID]: 00001
Along a coastal road somewhere south of the Yangtze River, a detachment of soldiers, each of them armed with a halberd, was escorting a line of seven prison carts, trudging northwards in the teeth of a bitter wind.
--------------------------------------------------------------------------------
[('Along', 'ADP'), ('a', 'DET'), ('coastal', 'ADJ'), ('road', 'NOUN'), ('somewhere', 'ADV'), ('south', 'ADV'), ('of', 'ADP'), ('the', 'DET'), ('Yangtze', 'PROPN'), ('River', 'PROPN'), (',', 'PUNCT'), ('a', 'DET'), ('detachment', 'NOUN'), ('of', 'ADP'), ('soldiers', 'NOUN'), (',', 'PUNCT'), ('each', 'DET'), ('of', 'ADP'), ('them', 'PRON'), ('armed', 'ADJ'), ('with', 'ADP'), ('a', 'DET'), ('halberd', 'NOUN'), (',', 'PUNCT'), ('was', 'AUX'), ('escorting', 'VERB'), ('a', 'DET'), ('line', 'NOUN'), ('of', 'ADP'), ('seven', 'NUM'), ('prison', 'NOUN'), ('carts', 'NOUN'), (',', 'PUNCT'), ('trudging', 'VERB'), ('northwards', 'ADV'), ('in', 'ADP'), ('the', 'DET'), (




In [11]:
# === Model comparison: LLM vs. spaCy, UD tagset ===

# Heading for the comparison report
print("英文词性标注模型对比（UD 赋码集）：LLM & Spacy")

# Compare the two annotation sets; the names passed here are the labels
# shown in front of each system's entries in the printed report.
compare_annos(llm_en_ud_annos, spacy_en_ud_annos, annos_1_name='LLM', annos_2_name='Spacy')

英文词性标注模型对比（UD 赋码集）：LLM & Spacy

[ID]: 00001
Along a coastal road somewhere south of the Yangtze River, a detachment of soldiers, each of them armed with a halberd, was escorting a line of seven prison carts, trudging northwards in the teeth of a bitter wind.
--------------------------------------------------------------------------------
LLM: {('armed', 'VERB'), ('each', 'PRON')}
Spacy: {('each', 'DET'), ('armed', 'ADJ')}

[ID]: 00002
In each of the first three carts a single male prisoner was caged, identifiable by his dress as a member of the scholar class. One was a white-haired old man. The other two were men of middle years.
--------------------------------------------------------------------------------
LLM: {('One', 'PRON')}
Spacy: {('One', 'NUM')}

[ID]: 00003
The four rear carts were occupied by women, the last of them by a young mother holding a baby girl at her breast.
--------------------------------------------------------------------------------
LLM: {('last', 'NOUN')}
Sp

In [12]:
# === Load model: spaCy tagger, PTB tagset ===

# Language: English
lang = 'english'

# Tagset: PTB (Penn Treebank)
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
tagset = 'ptb'

# Annotation mode: local spaCy model
mode = 'local'

# spaCy PTB model: en_core_web_trf
# https://spacy.io/models/en#en_core_web_trf
# On first load the model has to be downloaded from the network.

# Measure the load time with time.perf_counter(): it is a monotonic,
# high-resolution clock meant for timing intervals, whereas time.time()
# follows the wall clock and can jump if the system time is adjusted.
t0 = time.perf_counter()
# NOTE(review): `spay_` looks like a typo for `spacy_`; the name is kept
# unchanged because the annotation cell that follows refers to it.
spay_en_ptb_tagger = POSTagger(
    lang=lang,
    tagset=tagset,
    mode=mode,
)
t1 = time.perf_counter()
print('Spacy PTB 英文词性标注模型加载完毕！')
print(f'加载耗时：{t1-t0:.2f}s')

Spacy PTB 英文词性标注模型加载完毕！
加载耗时：2.55s


In [13]:
# === Annotate the English translations: spaCy tagger, PTB tagset ===

# Tag the English column with the spaCy PTB tagger
spacy_en_ptb_annos = annotate_data(data['target'], spay_en_ptb_tagger)

# Print a preview: the first annotated item
print('\n===Spacy PTB 英文词性标注结果预览 ===')
display_anno(spacy_en_ptb_annos[0])

POS Tagging: 100%|█████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 19.42it/s]


===Spacy PTB 英文词性标注结果预览 ===

[ID]: 00001
Along a coastal road somewhere south of the Yangtze River, a detachment of soldiers, each of them armed with a halberd, was escorting a line of seven prison carts, trudging northwards in the teeth of a bitter wind.
--------------------------------------------------------------------------------
[('Along', 'IN'), ('a', 'DT'), ('coastal', 'JJ'), ('road', 'NN'), ('somewhere', 'RB'), ('south', 'RB'), ('of', 'IN'), ('the', 'DT'), ('Yangtze', 'NNP'), ('River', 'NNP'), (',', ','), ('a', 'DT'), ('detachment', 'NN'), ('of', 'IN'), ('soldiers', 'NNS'), (',', ','), ('each', 'DT'), ('of', 'IN'), ('them', 'PRP'), ('armed', 'JJ'), ('with', 'IN'), ('a', 'DT'), ('halberd', 'NN'), (',', ','), ('was', 'VBD'), ('escorting', 'VBG'), ('a', 'DT'), ('line', 'NN'), ('of', 'IN'), ('seven', 'CD'), ('prison', 'NN'), ('carts', 'NNS'), (',', ','), ('trudging', 'VBG'), ('northwards', 'RB'), ('in', 'IN'), ('the', 'DT'), ('teeth', 'NNS'), ('of', 'IN'), ('a', 'DT'), ('bitter',




In [14]:
# === Model comparison: LLM vs. spaCy, PTB tagset ===

# Heading for the comparison report
print("英文词性标注模型对比（PTB 赋码集）：LLM & Spacy")

# Compare the two annotation sets; the names passed here are the labels
# shown in front of each system's entries in the printed report.
compare_annos(llm_en_ptb_annos, spacy_en_ptb_annos, annos_1_name='LLM', annos_2_name='Spacy')

英文词性标注模型对比（PTB 赋码集）：LLM & Spacy

[ID]: 00001
Along a coastal road somewhere south of the Yangtze River, a detachment of soldiers, each of them armed with a halberd, was escorting a line of seven prison carts, trudging northwards in the teeth of a bitter wind.
--------------------------------------------------------------------------------
LLM: {('armed', 'VBN'), ('teeth', 'NN')}
Spacy: {('teeth', 'NNS'), ('armed', 'JJ')}

[ID]: 00002
In each of the first three carts a single male prisoner was caged, identifiable by his dress as a member of the scholar class. One was a white-haired old man. The other two were men of middle years.
--------------------------------------------------------------------------------
LLM: set()
Spacy: set()

[ID]: 00003
The four rear carts were occupied by women, the last of them by a young mother holding a baby girl at her breast.
--------------------------------------------------------------------------------
LLM: set()
Spacy: set()

[ID]: 00004
The little gi