#### **1. 导入模块**

导入 Python 标准库和本项目自定义库

In [1]:
# 标准库
import os
import sys

# Add the parent directory to the module search path so that the
# project's own packages can be imported from this notebook.
# The membership check keeps the cell idempotent: re-running it does not
# append the same directory to sys.path again.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

# 自定义库
from src.llm_client import LLMClient

#### **2. 加载模型**

加载大模型 API 接口

In [2]:
# Name of the model to call.
# Available large models: kimi-k2.5 | glm-5 | deepseek-v3.2 | qwen3-max
model = 'kimi-k2.5'

# Sampling temperature: controls how diverse the generated output is.
# Higher values give more varied results; lower values give more
# deterministic ones.
# Valid range: [0, 2); default: 0.1
temperature = 0.1

# Thinking-mode switch.
# With enable_thinking = True the model outputs its full reasoning process,
# which produces longer text and consumes more tokens.
# Default: off.
enable_thinking = False

# Create the LLM API client.
# An API key is required:
#   1. Sign in to the Alibaba Cloud Bailian platform:
#      https://bailian.console.aliyun.com/
#   2. Apply for an API key for the large-model service.
#   3. Set LLM_API_KEY=sk-******** in the config file.
# Newly registered users can call some models for free; the list of free
# models is shown on the model service page after signing in.
client = LLMClient(model=model, temperature=temperature, enable_thinking=enable_thinking)
print('LLM API 接口加载完毕！')

LLM API 接口加载完毕！


#### **3. 多语种分词**

大模型多语种分词：批量标注

In [3]:
# === Multilingual tokenization prompt template: placeholders + JSON output ===

# Placeholder {lang}: filled in with the target language
# Placeholder {text}: filled in with the text to annotate

# The template is spelled out piece by piece so that every character of the
# prompt is visible, including the whitespace-only line after the task
# sentence. The resulting string is byte-identical to the earlier
# triple-quoted version.
prompt_tmpl = (
    "\n"
    "You are a professional corpus linguist specialized in {lang} tokenization.\n"
    "\n"
    "Your task is to tokenize the given text.\n"
    "    \n"
    "Return your result in a JSON list.\n"
    "\n"
    "Text: {text}\n"
)

In [4]:
# === Chinese test data ===

# Excerpt from the novel "The Deer and the Cauldron" (Lu Ding Ji).
# One passage per line; the cell that consumes this text strips it and
# splits it with splitlines(), sending each line to the model separately.

# NOTE(review): the quotation at the end is split across the last two lines,
# so per-line processing sees an opening quote without its closing mark on
# one line and a lone closing mark on the next. The saved output for the
# second-to-last line contains a closing-quote token that is not present in
# that input line -- worth confirming whether the line break is intended.

zh_text = """
江南近海滨的一条大路上，一队清兵手执刀枪，押着七辆囚车，冲风冒寒，向北而行。
前面三辆囚车中分别监禁的是三个男子，都作书生打扮，一个是白发老者，两个是中年人。
后面四辆囚车中坐的是女子，最后一辆囚车中是个少妇，怀中抱着个女婴。
女婴啼哭不休。 她母亲温言相呵，女婴只是大哭。
囚车旁一清兵恼了，伸腿在车上踢了一脚，喝道：“再哭，再哭！
老子踢死你！”
"""

In [5]:
# === Call the LLM API: batch tokenization of the Chinese sample ===

# --- Note ---
# To save on API costs, generated responses are stored in the local
# cache data/llm_cache. After the first call, repeating the same call
# only reads the stored result from the local database.
#
# To check that the API connection works, swap in different test data
# and run the annotation again.

# Set to True to also print each line's ID and source text around its
# tokens (replaces the debug prints that used to be commented out).
show_source = False

print('=== 中文批量分词 ===\n')

# One request per non-blank line: blank lines are dropped up front so that
# no API call is spent on empty text.
zh_lines = [line for line in zh_text.strip().splitlines() if line.strip()]

for index, text in enumerate(zh_lines):
    if show_source:
        print(f'[ID]: {index+1:05d}')
        print(text)
        print('-' * 60)

    # Fill the template placeholders for this line
    prompt = prompt_tmpl.format(lang='Chinese', text=text.strip())

    # Ask the model to tokenize the line.
    # NOTE(review): with json_output=True the client presumably returns the
    # parsed JSON list (the saved output shows Python lists) -- confirm
    # against LLMClient.get_response.
    tokens = client.get_response(prompt=prompt, json_output=True)
    print(tokens)

    if show_source:
        print('=' * 60)

=== 中文批量分词 ===

['江南', '近', '海滨', '的', '一条', '大路', '上', '，', '一队', '清兵', '手执', '刀枪', '，', '押着', '七辆', '囚车', '，', '冲风冒寒', '，', '向北', '而行', '。']
['前面', '三辆', '囚车', '中', '分别', '监禁', '的', '是', '三个', '男子', '，', '都', '作', '书生', '打扮', '，', '一个', '是', '白发', '老者', '，', '两个', '是', '中年人', '。']
['后面', '四辆', '囚车', '中', '坐', '的', '是', '女子', '，', '最后', '一辆', '囚车', '中', '是', '个', '少妇', '，', '怀中', '抱着', '个', '女婴', '。']
['女婴', '啼哭', '不休', '。', '她', '母亲', '温言', '相呵', '，', '女婴', '只是', '大哭', '。']
['囚车', '旁', '一', '清兵', '恼', '了', '，', '伸腿', '在', '车上', '踢', '了', '一脚', '，', '喝道', '：', '“', '再', '哭', '，', '再', '哭', '！', '”']
['老子', '踢', '死', '你', '！', '”']


In [6]:
# === English test data ===

# Excerpt from the English translation of the same novel:
# "The Deer and The Cauldron" (translated by John Minford).
# One passage per line; the cell that consumes this text strips it and
# splits it with splitlines(), sending each line to the model separately.

# NOTE(review): the quoted speech at the end is split across the last two
# lines, so per-line processing sees an opening quote on one line and the
# closing quote on the next -- confirm whether the line break is intended.

en_text = """
Along a coastal road somewhere south of the Yangtze River, a detachment of soldiers, each of them armed with a halberd, was escorting a line of seven prison carts, trudging northwards in the teeth of a bitter wind.
In each of the first three carts a single male prisoner was caged, identifiable by his dress as a member of the scholar class. One was a white-haired old man. The other two were men of middle years.
The four rear carts were occupied by women, the last of them by a young mother holding a baby girl at her breast.
The little girl was crying in a continuous wail which her mother's gentle words of comfort were powerless to console.
One of the soldiers marching alongside, irritated by the baby's crying, aimed a mighty kick at the cart. 'Stop it! Shut up!
Or I'll really give you something to cry about!'
"""

In [7]:
# === Call the LLM API: batch tokenization of the English sample ===

# --- Note ---
# To save on API costs, generated responses are stored in the local
# cache data/llm_cache. After the first call, repeating the same call
# only reads the stored result from the local database.
#
# To check that the API connection works, swap in different test data
# and run the annotation again.

print('=== 英文批量分词 ===\n')

# One request per non-blank line: blank lines are dropped up front so that
# no API call is spent on empty text.
en_lines = [line for line in en_text.strip().splitlines() if line.strip()]

for index, text in enumerate(en_lines):
    # Header: 1-based, zero-padded line ID followed by the source text
    print(f'[ID]: {index+1:05d}')
    print(text)
    print('-' * 60)

    # Fill the template placeholders for this line
    prompt = prompt_tmpl.format(lang='English', text=text.strip())

    # Ask the model to tokenize the line.
    # NOTE(review): with json_output=True the client presumably returns the
    # parsed JSON list (the saved output shows Python lists) -- confirm
    # against LLMClient.get_response.
    tokens = client.get_response(prompt=prompt, json_output=True)
    print(tokens)
    print('=' * 60)

=== 英文批量分词 ===

[ID]: 00001
Along a coastal road somewhere south of the Yangtze River, a detachment of soldiers, each of them armed with a halberd, was escorting a line of seven prison carts, trudging northwards in the teeth of a bitter wind.
------------------------------------------------------------
['Along', 'a', 'coastal', 'road', 'somewhere', 'south', 'of', 'the', 'Yangtze', 'River', ',', 'a', 'detachment', 'of', 'soldiers', ',', 'each', 'of', 'them', 'armed', 'with', 'a', 'halberd', ',', 'was', 'escorting', 'a', 'line', 'of', 'seven', 'prison', 'carts', ',', 'trudging', 'northwards', 'in', 'the', 'teeth', 'of', 'a', 'bitter', 'wind', '.']
[ID]: 00002
In each of the first three carts a single male prisoner was caged, identifiable by his dress as a member of the scholar class. One was a white-haired old man. The other two were men of middle years.
------------------------------------------------------------
['In', 'each', 'of', 'the', 'first', 'three', 'carts', 'a', 'single', 'mal