In [1]:
"""
现代汉语拼音标注引擎（轻量级教学版）
环境要求：Python 3.8+，仅需pypinyin和jieba两个依赖库
"""

import re
from collections import defaultdict
from pypinyin import pinyin, Style, load_phrases_dict, load_single_dict
from outils import load_cn_json, dump_cn_json, dump_cn_json_compact
import jieba

In [3]:
with open("xiandaihaiyuchangyongcibiao.txt", "r", encoding="utf-8") as f:
    content = f.readlines()

In [11]:
# 制作中文词语到拼音的映射
ciyin = {}  # 词语拼音映射表，将中文词语映射到它的拼音
n0 = 0
for line in content:
    raws = line.rstrip("\n").split("\t")
    cy = []
    for yin in raws[1].split("'"):
        if len(yin) and yin[-1] in "0123456789":
            cy.append((yin[:-1], int(yin[-1])))
        else:
            cy.append((yin, 0))
            n0 += 1
    ciyin[raws[0]] = cy

dump_cn_json_compact("frequent_word_pinyin.json", ciyin)

In [12]:
len(ciyin)

55735

In [18]:
def contains_lowercase_letter(s):
    """
    判断字符串中是否包含小写英文字母
    :param s: 输入字符串
    :return: 如果包含小写英文字母，返回True；否则返回False
    """
    for char in s:
        if char.islower():
            return True
    return False

# 示例测试
test_strings = ["Hello", "WORLD", "12345", "abc", "迸发jin!", "A1b2C3"]
for test in test_strings:
    print(f"'{test}' 是否包含小写英文字母: {contains_lowercase_letter(test)}")

'Hello' 是否包含小写英文字母: True
'WORLD' 是否包含小写英文字母: False
'12345' 是否包含小写英文字母: False
'abc' 是否包含小写英文字母: True
'迸发jin!' 是否包含小写英文字母: True
'A1b2C3' 是否包含小写英文字母: True


In [27]:
ciyin = load_cn_json("frequent_word_pinyin.json")
len(ciyin)

55810

In [42]:
with open("upchars.txt", "r", encoding="utf-8") as f:
    content = f.readlines()

ziyin = {}
codeyin = {}
codezi = {}
valen = {}
for line in content[2:]:
    if line.startswith("U"):
        raws = line.rstrip("\n").split(":")
        val = raws[0]
        raws = raws[1].split("#")
        yin = tuple([a.strip() for a in raws[0].strip().split(",")])
        zi = raws[1].strip()
        codeyin[val] = yin
        ziyin[zi] = yin
        codezi[val] = zi

dump_cn_json_compact("large_ziyin.json", ziyin)
dump_cn_json_compact("large_ziyin_by_unicode.json", codeyin)
dump_cn_json_compact("large_hanzi_by_unicode.json", codezi)

In [42]:
with open("upwords.txt", "r", encoding="utf-8") as f:
    content = f.readlines()

n = 2
comments = []
ciyin = {}
for line in content[n:]:
    n += 1
    raws = [a.strip() for a in line.rstrip("\n").split(":")]
    ci = raws[0]
    yin = [a.strip() for a in raws[1].split("#")[0].strip().split(" ")]
    ciyin[ci] = yin
    if "#" in line:
        comments.append(line)

len(ciyin), len(comments)

(377960, 1)

In [3]:
ciyin = load_cn_json("large_ciyin.json")
ciyin["有的放矢"]

[['you', 3], ['di', 4], ['fang', 4], ['shi', 3]]

In [48]:
def standard_to_numbered(pinyin_str, fen=False):
    """
    将标准拼音格式（如“zhōng”）转换为字母加数字的格式（如“zhong1”）
    输入：
    pinyin_str (str): 标准拼音字符串
    fen (bool): 是否将输出分开
    输出：
    无音调拼音加数字（str）或 (拼音,数字) 元组（(str, int)）
    """
    tone_map = {
        'ā': 'a1', 'á': 'a2', 'ǎ': 'a3', 'à': 'a4',
        'ē': 'e1', 'é': 'e2', 'ě': 'e3', 'è': 'e4',
        'ī': 'i1', 'í': 'i2', 'ǐ': 'i3', 'ì': 'i4',
        'ō': 'o1', 'ó': 'o2', 'ǒ': 'o3', 'ò': 'o4',
        'ū': 'u1', 'ú': 'u2', 'ǔ': 'u3', 'ù': 'u4',
        'ǖ': 'v1', 'ǘ': 'v2', 'ǚ': 'v3', 'ǜ': 'v4',
        "ń": 'n2', "ň": 'n3', "ǹ": 'n4',
        "m̄": "m1", "ḿ": "m2", "m̀": 'm4'
    }

    for tone_char, replacement in tone_map.items():
        if tone_char in pinyin_str:
            if fen:
                return (pinyin_str.replace(tone_char, replacement[0]).replace("ü", "v"), int(replacement[1]))
            else:
                return pinyin_str.replace(tone_char, replacement[0]).replace("ü", "v") + replacement[1]
    
    if fen:
        return (pinyin_str.replace("ü", "v"), 0)
    else:
        return pinyin_str.replace("ü", "v") + "0"

# print(standard_to_numbered("hǎo"))

def check_numbered_pinyin(s):
    if "a" in s:
        pies = s.split("a")
        if len(pies) > 2:
            return False
        for c in pies[0]:
            if c in ("e", "o", "v"):
                return False
        for c in pies[1]:
            if c in ("e", "v", "u"):
                return False
    if "o" in s:
        pies = s.split("o")
        if len(pies) > 2:
            return False
        for c in pies[0]:
            if c in ("e", "v"):
                return False
        for c in pies[1]:
            if c in ("a", "e", "i", "v"):
                return False
    if "u" in s:
        pies = s.split("u")
        if len(pies) > 2:
            return False
        for c in pies[0]:
            if c in ("e", "v"):
                return False
        for c in pies[1]:
            if c in ("e", "v", "u"):
                return s[0] in ("j", "q", "x") and c == "e"
    return True

def merge_numbered(s, n=0):
    return s + str(n)

def numbered_to_standard(pinyin_str):
    # 检查是否符合拼音规则
    if not check_numbered_pinyin(pinyin_str):
        print("不符合规则")
        return pinyin_str
    
    # 轻声：结尾没有数字或数字不是1234
    if not pinyin_str[-1] in "0123456789":
        return pinyin_str.replace("v", "ü")
    elif not pinyin_str[-1] in "1234":
        return pinyin_str[:-1].replace("v", "ü")
    else:
        tone = int(pinyin_str[-1]) - 1  # 1234
        tone_map = {
            "a": ('ā','á','ǎ','à'),
            "e": ('ē','é','ě','è'),
            "i": ('ī','í','ǐ','ì'),
            "o": ('ō','ó','ǒ','ò'),
            "u": ('ū','ú','ǔ','ù'),
            "v": ('ǖ','ǘ','ǚ','ǜ'),
            "n": ("n","ń","ň","ǹ"),
            "m": ("m̄","ḿ","m","m̀")
        }

        for c in pinyin_str:  # 如果有a或e则标在上面
            if c in ("a", "e"):
                return pinyin_str[:-1].replace(c, tone_map[c][tone]).replace("v", "ü")
        for c in pinyin_str:  # 否则如果有o就标在o上面（ou, io）
            if c == "o":
                return pinyin_str[:-1].replace(c, tone_map[c][tone]).replace("v", "ü")
        for c in pinyin_str:  # 否则如果有i或u就标在上面（i, u, iu, ui），并列时标在后
            if c in ("i", "u"):
                if "i" in pinyin_str and "u" in pinyin_str:
                    return pinyin_str[:-1].replace("iu", "i"+tone_map["u"][tone]).replace("ui", "u"+tone_map["i"][tone]).replace("v", "ü")
                return pinyin_str[:-1].replace(c, tone_map[c][tone]).replace("v", "ü")
        for c in pinyin_str:  # 否则如果有v就标在v上面
            if c == "v":
                return pinyin_str[:-1].replace(c, tone_map[c][tone])
        for c in pinyin_str:  # 否则标在n或m上面
            if c in ("n", "m"):
                return pinyin_str[:-1].replace(c, tone_map[c][tone])
        return pinyin_str[:-1]  # 语法正确
         
# samples = ["huai2", "xue4", "hua1", "zhuang4", "lie4", "lve", "liao3", "huo3", "lei2", "hui5", "kong4", "xiong1", "mao4", "liu6", "biang1", "jiao4", "jia3", "ke1", "bo1", "lu4", "yin2", "ma0", "mai2", "xie4", "xiu1", "xv1", "xiang2", "ng2", "nv3", "m2"]
# for sample in samples:
#     out = numbered_to_standard(sample)
#     back = standard_to_numbered(out, fen=True)
#     print(sample, "-->", out, "-->", back)

ciyin_numbered = {}
for ci, yins in ciyin.items():
    ciyin_numbered[ci] = [standard_to_numbered(yin, fen=True) for yin in yins]

len(ciyin_numbered)

377960

In [57]:
dump_cn_json_compact("large_ciyin.json", ciyin_numbered)

In [None]:
path_xx = "../src/小学/阅读课文.json"
lessons = load_cn_json(path_xx)
lessons_list = [lessons[k] for k in lessons.keys()]


中文标点 = "，。！？“”‘’；：……、《》（）~"
worded_texts = {}

for idx, lesson in lessons.items():
    worded_texts[idx] = []
    if lesson["format"] == "散文":
        for text in lesson["content"]:
            worded_texts[idx].append(jieba.lcut(re.sub(f"[{中文标点}]", "", re.sub(r"\\apost{.+?}", "", text)), cut_all=False))
    elif lesson["format"] == "书信":
        for text in lesson["content"]:
            if not len(text):
                break
            worded_texts[idx].append(jieba.lcut(re.sub(f"[{中文标点}]", "", re.sub(r"\\apost{.+?}", "", text)), cut_all=False))
    elif lesson["format"] == "诗歌":
        for part in lesson["content"]:
            for text in part:
                worded_texts[idx].append(jieba.lcut(re.sub(f"[{中文标点}]", "", re.sub(r"\\apost{.+?}", "", text)), cut_all=False))



In [None]:
from pypinyin import pinyin, Style

pinyin(["数", "数不清", "数数看", "掠过", "能量"], style=Style.TONE, heteronym=True)

In [71]:
# yins = []
# for para in worded_texts["杏儿熟了"]:
    # print(para)
    # for text in para:
    #     yins = pinyin(text, style=Style.TONE, heteronym=False)
    #     print(text, "\t\t", " ".join([yin[0] for yin in yins]))
    # print("--")

# len(yins), len(para)

title = "杏儿熟了"
for para in lessons[title]["content"]:
    print(para)

我们家院子里有一棵大杏树。每年到了麦收时节，树上就结满了黄澄澄的杏儿。从我家门前路过的人，总要望望那棵杏树，羡慕地说道：“呵，好杏儿呀！”
杏树是奶奶亲手栽的。听奶奶说，栽杏树的时候还没有我呢。有一回，我偎依在奶奶的怀里问她：“奶奶，咱们家树上的杏儿有多少个？”
“多得数不清啊。要不，你数数看。”
我仰着头数起来，“一、二、三……”数呀，数呀，数到后来就糊涂了。奶奶忍不住笑了。我不知道奶奶是笑我傻， 还是笑杏儿多。
这一年杏子又熟了。有一天，奶奶正在做饭，忽然听见有孩子在哭。奶奶急忙走出去，原来是邻居家的小淘淘偷摘杏儿，不小心从树上摔下来了。一块儿来的小伙伴见了奶奶都低下了头，不敢吱声。我没好气地说：“你们这些馋猴儿，偷吃人家的东西，摔了活该！”
我心里想：“看我奶奶怎么收拾你们！”奶奶走过去扶起淘淘，给他揉揉腿，看他没伤着，就站起身往屋里走，又回过头来对孩子们说：“你们先别走。”
过了一会儿，奶奶拿了一根长竹竿从屋里出来了。她走到树下，挑熟了的杏子往下打。她脚底下站不大稳，身子颤颤巍巍的。
杏儿一个接一个落在地上。我连忙弯腰去捡，不一会儿就捡了一衣兜。奶奶把小淘淘和他的伙伴都叫了过来，一人分给五六个，剩下的几个给了我。看他们吃得那样香甜，奶奶的嘴角上露出了微笑。我有点儿不高兴，奶奶却笑着说：“果子大家吃才香甜。要记住，杏儿熟了，让乡亲们都尝尝鲜。''
听了奶奶的话，我点了点头。以后，我每年都照奶奶的吩咐，把熟透了的杏儿分给小伙伴们吃，也送给邻居的叔叔婶子们尝鲜。
今年的杏儿又熟了。望着黄澄澄的杏儿挂满了枝头，我眼前又出现了奶奶颤颤巍巍地打杏儿的情景。于是，我挑熟透了的杏儿打下一些来，给乡亲们送去——给他们送去香甜，也给他们送去喜悦。


In [14]:
import jieba
import re
import numpy as np

def load_textbook(path):
    textbook = load_cn_json(path)
    
    nx = np.sort(np.array([text['numero'] for _, text in textbook.items()]))
    lessons = []
    for i in nx:
        for _, text in textbook.items():
            if text['numero'] == i:
                lessons.append(text)
    for lesson in lessons:
        lesson["numero"] = int((lesson["numero"] + 1) / 10)

    return lessons

# path_sz = "../src/小学/发蒙识字.json"
# lessons = load_textbook(path_sz)
path_xx = "../src/小学/阅读课文.json"
lessons = load_cn_json(path_xx)
lesson = lessons[14]
print(lesson["title"])

中文标点 = "，。！？“”‘’；：……、《》（）~"

words = []
for text in lesson["content"]:
    words.append(jieba.lcut(re.sub(f"[{中文标点}]", "", text), cut_all=False))

words

工农兵


[['工人', '力量', '大'],
 ['农民', '力量', '大'],
 ['士兵', '力量', '大'],
 ['好好学习', '天天向上'],
 ['团结起来', '保卫国家'],
 ['中国', '人民', '大团结', '世界', '人民', '大团结']]

In [7]:
words

[['工人', '力量', '大', '。'],
 ['农民', '力量', '大', '。'],
 ['士兵', '力量', '大', '。'],
 ['好好学习', '，', '天天向上', '。'],
 ['团结起来', '，', '保卫国家', '。'],
 ['中国', '人民', '大团结', '，', '世界', '人民', '大团结', '。']]

In [5]:
with open("gemini_result.txt", "r", encoding="utf-8") as f:
    content = f.readlines()
    res_gm = []
    for line in content:
        if len(line) > 1:
            res_gm.append(line[:-1].split(" "))

with open("ds_result.txt", "r", encoding="utf-8") as f:
    content = f.readlines()
    res_ds = []
    for line in content:
        if len(line) > 1:
            res_ds.append(line[:-1].split(" "))

res_ds

[['wǒ',
  'men',
  'jiā',
  'yuàn',
  'zi',
  'lǐ',
  'yǒu',
  'yī',
  'kē',
  'dà',
  'xìng',
  'shù',
  '。',
  'měi',
  'nián',
  'dào',
  'le',
  'mài',
  'shōu',
  'shí',
  'jié',
  '，',
  'shù',
  'shàng',
  'jiù',
  'jié',
  'mǎn',
  'le',
  'huáng',
  'dēng',
  'dēng',
  'de',
  'xìng',
  'r',
  '。',
  'cóng',
  'wǒ',
  'jiā',
  'mén',
  'qián',
  'lù',
  'guò',
  'de',
  'rén',
  '，',
  'zǒng',
  'yào',
  'wàng',
  'wang',
  'nà',
  'kē',
  'xìng',
  'shù',
  '，',
  'xiàn',
  'mù',
  'dì',
  'shuō',
  'dào',
  '：“',
  'hē',
  '，',
  'hǎo',
  'xìng',
  'r',
  'ya',
  '！”'],
 ['xìng',
  'shù',
  'shì',
  'nǎi',
  'nai',
  'qīn',
  'shǒu',
  'zāi',
  'de',
  '。',
  'tīng',
  'nǎi',
  'nai',
  'shuō',
  '，',
  'zāi',
  'xìng',
  'shù',
  'de',
  'shí',
  'hòu',
  'hái',
  'méi',
  'yǒu',
  'wǒ',
  'ne',
  '。',
  'yǒu',
  'yī',
  'huí',
  '，',
  'wǒ',
  'wēi',
  'yī',
  'zài',
  'nǎi',
  'nai',
  'de',
  'huái',
  'lǐ',
  'wèn',
  'tā',
  '：“',
  'nǎi',
  'nai',
  '，',
  'zán',
  'm

In [2]:
import ssl
print(ssl.OPENSSL_VERSION)

OpenSSL 3.0.15 3 Sep 2024


In [3]:
import requests

url = "https://api.siliconflow.cn/v1/chat/completions"

payload = {
    "model": "deepseek-ai/DeepSeek-V3",
    "messages": [
        {
            "role": "user",
            "content": "中国大模型行业2025年将会迎来哪些机遇和挑战？"
        }
    ],
    "stream": False,
    "max_tokens": 512,
    "stop": ["null"],
    "temperature": 0.7,
    "top_p": 0.7,
    "top_k": 50,
    "frequency_penalty": 0.5,
    "n": 1,
    "response_format": {"type": "text"},
    "tools": [
        {
            "type": "function",
            "function": {
                "description": "<string>",
                "name": "<string>",
                "parameters": {},
                "strict": False
            }
        }
    ]
}
headers = {
    "Authorization": "sk-cvhaokjrvhdscwoxqntyyzvkmnrozlsdrdomyebiwkqfisct",
    "Content-Type": "application/json"
}

response = requests.request("POST", url, json=payload, headers=headers)

print(response.text)

"Invalid token"


In [8]:
import ssl
print(ssl.OPENSSL_VERSION)

OpenSSL 1.1.1k  25 Mar 2021
