In [3]:
import json
import os
import matplotlib.pyplot as plt
import numpy as np
from outils import read, keys, load_cn_json, dump_cn_json, 中转数, 数转中, set_char_colors, nice_print, sort_dict_with, dump_cn_json_compact

def wrap(s, wrapper="{}", keep_wrapper=False):
    if s:
        return wrapper[0] + s + wrapper[-1]
    if keep_wrapper:
        return wrapper
    return ""

def make_params(params, wrapper="[]", sep=","):
    return wrap(sep.join(params), wrapper)

def wrap_env(name, content, params=[], param_wrapper="[]", param_sep=","):
    out = r"\begin" + wrap(name) + make_params(params, wrapper=param_wrapper, sep=param_sep) + "\n"
    lines = content[:-1].split("\n")  # presume content ends with \n
    for line in lines:
        out += "    " + line + "\n"
    out += r"\end" + wrap(name) + "\n"
    return out

def wrap_method(method, content="", wrapper="{}", keep_wrapper=True, params=[], param_wrapper="[]", param_sep=","):
    return '\\' + method + make_params(params, wrapper=param_wrapper, sep=param_sep) + wrap(content, wrapper=wrapper, keep_wrapper=keep_wrapper)

def zihao(n):
    return wrap_method("zihao", str(n))

def package_update_xcolor(packages, texts):
    xcolor = packages["xcolor"]
    xcolor["defined_colors"] = {}
    for _, text in texts.items():
        if "character_colors" in text:
            for key, val in text["character_colors"].items():
                xcolor["defined_colors"][key] = val
    packages["xcolor"] = xcolor

def make_ctex_env(document_class="ctexbook", document_class_params=("12pt", "UTF-8","openany"), packages={"ctex": [], "titlesec": []}, mainfont="Mona Sans Light", lineskip="4pt", parskip="10pt", title="标题", author="", date=False, toc=True):
    """make header and footer for ctexbook environment. 
    header
    1. documentclass and parameters 
    2. packages
    3. geometry and fonts
    4. package setups
    5. global typesettings
    6. begin document
    footer
    1. end document
    """
    # ## header ##

    # document class
    header = r"\documentclass"+ make_params(document_class_params) + wrap(document_class) + "\n"
    
    # packages
    packages_str = ""
    for name in packages:
        # print(package)
        package_declarations = ""
        if "declarations" in packages[name]:
            package_declarations = make_params(packages[name]['declarations'])
        packages_str += r"\usepackage" + package_declarations + wrap(name) + "\n"
    # print(packages_str)
    header += packages_str + "\n"

    # geometry <-- geometry package
    if "geometry" in packages:
        geometry = packages["geometry"]
        paper_type = geometry["paper_size"]
        paddings = geometry["paddings"]
        left = paddings["left"]
        right = paddings["right"]
        top = paddings["top"]
        bottom = paddings["bottom"]
        header += wrap_method("geometry", f"{paper_type}paper,left={left},right={right},top={top},bottom={bottom}") + "\n"
    
    # fonts
    header += r"\renewcommand{\footnotesize}{\fontsize{8.5pt}{10.5pt}\selectfont}" + "\n"
    header += wrap_method("setmainfont", mainfont) + "\n"
    header += r"\setCJKmainfont[BoldFont=STZhongsong]{汉字之美仿宋GBK 免费}" + "\n"
    header += r"\xeCJKDeclareCharClass{CJK}{`0 -> `9}" + "\n"  # apply CJK font to numbers
    header += r"\xeCJKsetup{AllowBreakBetweenPuncts=true}" + "\n"  # line alignment

    if "footmisc" in packages:
        footnote_settings_content = "".join([r"{\ding{"+str(192+i)+r"}}" for i in range(10)])
        footnote_settings = wrap_method("DefineFNsymbols", footnote_settings_content, params=["circled"], param_wrapper="{}")
        header += footnote_settings + "\n"
        header += wrap_method("setfnsymbol", "circled") + "\n"

    # package setups
    # xpinyin
    if "xpinyin" in packages:
        pyr = packages['xpinyin']['ratio']  # size ratio
        vsep = packages['xpinyin']['vsep']  # vertical gap
        vsep_str = "vsep={" + vsep + "}"
        hsep = packages['xpinyin']['hsep']  # horizontal gap
        hsep_str = "hsep={" + f"{hsep} plus {hsep}" + "}"
        header += wrap_method("xpinyinsetup", f"ratio={pyr},{hsep_str},{vsep_str}") + "\n"  # pinyin settings

    # hanzibox
    if "hanzibox" in packages:
        hanzibox = packages["hanzibox"]
        frametype = hanzibox['frametype']
        framelinewidth = hanzibox['framelinewidth']
        width = hanzibox['width']
        resize = hanzibox['resize']
        framecolor = hanzibox["framecolor"]
        pinyinline = hanzibox['pinyinline']
        pinyinf = hanzibox['pinyinf']
        pinyincolor = hanzibox['pinyincolor']
        charcolor = hanzibox['charcolor']
        charf = "charf={" + hanzibox["charf"]["font"] + hanzibox["charf"]["fontsize"] + "}"
        header += wrap_method("hanziboxset", f"frametype={frametype},framelinewidth={framelinewidth},width={width},resize={resize},pinyinline={pinyinline},framecolor={framecolor},{charf},pinyinf={pinyinf},pinyincolor={pinyincolor},charcolor={charcolor}") + "\n"  # hanzibox settings

    # package setups
    # xcolor
    if "xcolor" in packages:
        defcolor_str = ""
        for key, (r, g, b) in packages["xcolor"]["defined_colors"].items():
            rgb_plate = f"{r},{g},{b}"
            defcolor_str += wrap_method("definecolor", key) + r"{RGB}{" + rgb_plate + r"}" + "\n"
        header += defcolor_str + "\n"

    # global typesettings
    # title format
    header += r"\titleformat{\chapter}{\zihao{-1}\bfseries}{ }{16pt}{}" + "\n"
    header += r"\titleformat{\section}{\zihao{-2}\bfseries}{ }{0pt}{}" + "\n"
    header += r"\title" + wrap(r"\zihao{0} \bfseries " + title) + "\n"
    # line and paragraph skips
    header += r"\setlength{\lineskip}{" + lineskip + "}\n"  # skip length after line
    header += r"\setlength{\parskip}{" + parskip + "}\n"  # extra skip for paragraphs 
    # front page format
    if author:  # author format
        header += r"\author{\zihao{2} \texttt" + wrap(author) + "}\n"
    else:
        header += r"\author{}" + "\n"
    if date:  # date format
        header += r"\date{\bfseries\today}" + "\n"
    else:
        header += r"\date{}" + "\n"
    
    # begin document
    header += r"\begin" + wrap("document") + "\n"
    header += r"\maketitle" + "\n"
    if toc:
        header += r"\tableofcontents" + "\n"
    header += r"\newpage" + "\n"
    
    # ## footer ##

    # end document
    footer = r"\end" + wrap("document") + "\n"
    return header, footer

def read_text(path, format="散文"):
    """Read raw text and formalize to json
    Inputs: 
    path (str): file path to the raw text.
    format (str): format of the text.
    Output:
    out (dict): a jsonifiable dictionary with formalized text.
    Example:
    out["format"]     : format of the text (in the sense of tex printing).
    out["genre"]      : genre and other tags of the text.
    out["content"]    : content of the text. A list of strings.
    out["grade"]      : recommanded student grade (for the purpose of eduation).
    out["title"]      : title of the text.
    out["author"]     : author of the text.
    out["remarks"]    : remarks concerning the text.
    out["footnotes"]  : footnotes of the content of the text.
    out["endnotes"]   : endnotes of the content of the text.
    out["vocabulary"] : vocabulary to learn (for the purpose of eduation).
    """
    lines = read(path)
    out = {}
    title = ""
    if len(lines) and len(lines[0]):
        author = ""
        grade = 0
        footnotes = []
        endnotes = []
        vocabulary = []
        remarks = []
        content = []
        out["format"] = format
        out["genre"] = [format]
        # return lines
        if format in ("散文", "书信", "小说", "剧本"):
            for line in lines:
                line0 = line.strip()
                if line0:
                    if not title:
                        title = line0
                    elif grade < 1 and line.startswith("年级："):
                        grade = int(line0[3:])
                    elif not author and line.startswith("作者："):
                        author = line0[3:]
                    elif line.startswith("备注："):
                        remarks.append(line0[3:])
                    elif line.startswith("注释："):
                        footnotes.append(line0[3:])
                    elif line.startswith("脚注："):
                        footnotes.append(line0[3:])
                    elif line.startswith("尾注："):
                        endnotes.append(line0[3:])
                    elif line.startswith("词汇："):
                        vocabulary.extend(line0[3:].split())
                    else:
                        content.append(line0)
        elif format == "诗歌":
            para = []
            for line in lines:
                line0 = line.strip()
                if line0:
                    if not title:
                        title = line0
                    elif grade < 1 and line.startswith("年级："):
                        grade = int(line0[3:])
                    elif not author and line.startswith("作者："):
                        author = line0[3:]
                    elif line.startswith("备注："):
                        remarks.append(line0[3:])
                    elif line.startswith("注释："):
                        footnotes.append(line0[3:])
                    elif line.startswith("脚注："):
                        footnotes.append(line0[3:])
                    elif line.startswith("尾注："):
                        endnotes.append(line0[3:])
                    elif line.startswith("词汇："):
                        vocabulary.extend(line0[3:].split())
                    else:
                        para.append(line0)
                elif len(para):
                    content.append("|#|".join(para))
                    para = []
            if len(para):
                content.append("|#|".join(para))
        # make footnotes dict
        footdict = {}
        i = 0
        keybase = "fn"
        content = "@".join(content)
        for note in footnotes:
            word = ""
            if note.startswith("〔"):
                word = note.split("〕")[0][1:]
                key = keybase + str(i+1)
                footdict[key] = note
                i += 1
            elif "〕" in note:  # key is already marked in the text with the format "\apost{a...}".
                key = note.split("〕")[0].split("〔")[0]
                footdict[key] = "".join(note.split(key)[1:])
            # print(word)
            if word:  # find the position to insert footnote and mark
                nfin = content.find(word) + len(word)
                content = content[:nfin] + r"\apost{" + key + "}" + content[nfin:]
        if "|#|" in content:
            content_new = []
            for para in content.split("@"):
                content_new.append(para.split("|#|"))
            content = content_new
        else:
            content = content.split("@")
        
        out["title"] = title
        out["author"] = author
        out["content"] = content
        out["remarks"] = remarks
        out["footnotes"] = footdict
        out["endnotes"] = endnotes
        out["vocabulary"] = vocabulary
        if grade:
            out["grade"] = grade
    return title, out

def text_content_to_tex_str(text, verbose=0, verseprop=0.5, format="散文", footnotes={}, endnotes=[]):
    """convert the content of a text to text string ready for tex.
    the format varies by genre:
    散文、小说
    书信
    诗歌
    剧本
    """ 
    content = text["content"]
    if "footnotes" in text:
        footnotes = text["footnotes"]
    if "format" in text:
        format = text["format"]
    out = ""
    if format in ("散文", "小说",):
        out = "\n\n".join(content) + "\n"
    elif  format == "书信":
        if verbose and not content[0].endswith("："):
            print("错误：第一行不是抬头")
            return "格式错误\n"
        out = r"\noindent " + content[0] + "\n\n" + wrap_method("vspace", "24pt") + "\n\n"
        toright = False
        toright_content = ""
        for line in content[1:]:
            if line:
                if toright:
                    toright_content += line + "\n\n"
                else:
                    out += line + "\n\n"
            else:
                toright = True
        out += wrap_method("vspace", "36pt") + "\n\n"
        out += wrap_env("flushright", toright_content) + "\n\n"
    elif  format == "诗歌":
        if not isinstance(content[0], list):
            content = [content]
        
        lineskip = " \\\\\n"
        # parskip = "\n" + wrap_method("vspace", "4pt") + "\n\n"
        parskip = "\n\n"
        out = parskip.join([wrap_env("verse", lineskip.join(par) + "\n", params=[str(verseprop)+"\\linewidth"]) for par in content])
    elif format == "剧本":
        name_set = text["characters"]
        for line in content:
            if line.startswith("\\item["):
                name = line.split("]")[0][6:]
                colored_name = r"{\color{" + name_set[name] + r"} " + name + r"}"
                out += "\\item[" + colored_name + "]" + "]".join(line.split("]")[1:])
            elif line.startswith("$"):
                colored_line = line
                for name in name_set:
                    colored_line = colored_line.replace(name, r"{\color{" + name_set[name] + r"} " + name + r"}")
                out += colored_line
            else:
                out += line
            out += "\n\n"
    for key in footnotes:
        out = out.replace("apost{"+key+"}", "footnote{" + footnotes[key] + "}")
    return out

def endnotes_to_str(endnotes, verbose=0, pinyin=False):
    """convert the endnotes to text string ready for tex."""
    out = ""
    notes = ""
    for note in endnotes:
        if pinyin and note.startswith("〔"):  # add pinyin
            suite = note[1:].split("〕")
            notes += r"\item " + note[0] + r"\xpinyin*{" + suite[0] + r"}〕" + r"〕".join(suite[1:]) + "\n"
        else:
            notes += r"\item " + note + "\n"
    if notes:
        out = r"\newpage" + "\n\n" + r"\textbf{注释}：" + "\n\n" + r"\vspace{-1em}" + "\n\n"
        out += wrap_env("itemize", r"\setlength\itemsep{-0.2em}" + "\n" + notes)
    return out

def shizi_to_str(zis, n=8):
    out = r"\clearpage" + "\n\n"
    boxes = ""
    i = 0
    for zi in zis:
        boxes += wrap_method("hanzibox", zi)
        i += 1
        if i == n:
            boxes += "\n\n"
            i = 0
    out += wrap_env("center", boxes + "\n\n")
    return out

def xiezi_to_str(zis, ncol=2, nex=4, hspace=1):
    out = ""
    boxes = ""
    i = 0
    for zi in zis:
        boxes += wrap_method("hanzibox", zi)
        for j in range(nex):
            boxes += wrap_method("hanzibox", "")
        i += 1
        if i == ncol:
            boxes += "\n\n"
            i = 0
        else:
            boxes += wrap_method("hspace", f"{hspace}em")
    out += boxes + "\n\n"
    # out += wrap_env("center", boxes + "\n\n")
    return out

def text_to_tex_str(text, typesettings={"font": {"title": {"size": 2}, "plaintext": {"size": "normalsize"}}, "vspaces": {"after_title": 12, "after_author": 6, "after_content": 6}}):
    """convert a text object to text string ready for tex
    """
    out = ""
    content = ""
    # title_fontsize = typesettings["font"]["title"]["size"]
    # title = wrap_method("textbf", zihao(title_fontsize) + " " + text["title"]) + "\n"
    title = wrap_method("chapter", text["title"]) + "\n\n"
    content += title
    # content = wrap_env("center", content) + "\n"
    # content += wrap_method("vspace", f"{typesettings['vspaces']['after_title']}pt") + "\n\n"
    content += wrap_env(typesettings["font"]["plaintext"]["size"], "\n" + text_content_to_tex_str(text) + "\n")
    # content += wrap_method("vspace", f"{typesettings['vspaces']['after_content']}pt") + "\n\n"
    out += content + "\n\n"
    # out += wrap_method("newpage", keep_wrapper=False) + "\n\n"
    if "endnotes" in text:
        out += endnotes_to_str(text["endnotes"])
    if "shizi" in text:
        nchars = 10
        if len(text["shizi"]) % 10 == 1:
            nchars = 8
        out += shizi_to_str(text["shizi"], n=nchars) + "\n\n"
        if "xiezi" in text:
            out += xiezi_to_str(text["xiezi"]) + "\n\n"
    return out

def add_text(texts, title, content, format="散文", tags=[]):
    """Add a text to the dictionary of texts.
    Inputs:
    texts (dict): dictionary of texts. title --> content.
    title (str): title of the text.
    content (dict): content of the text.
    format (str): format of the text.
    tags (list of str): tags to describe the text.
    Output:
    texts: updated dictionary of texts. 
    """
    if len(tags):
        content["genre"] = tags
    if format == "剧本":
        if title not in texts:
            script_keys = []
            for _, text in texts.items():
                if text["format"] == "剧本" and "key" in text:
                    script_keys.append(int(text["key"].split("-")[1]))
            if len(script_keys):
                script_key = "script-" + str(max(script_keys) + 1)
            else:
                script_key = "script-1"
                
        else:
            script_key = texts[title]["key"]
        name_set, color_set = set_char_colors(content["content"], script_key)
        
    texts[title] = content
    if format == "剧本":
        texts[title]["key"] = script_key
        texts[title]["characters"] = name_set
        texts[title]["character_colors"] = color_set
    return texts


In [4]:
# 打印页面设置：纸号，页边距等
geometry = {}
geometry["paper_size"] = "a5"  # 使用A5纸
paddings = {}  # 页边距
paddings["left"] = "1.4cm"
paddings["right"] = "1.4cm"
paddings["top"] = "2.3cm"
paddings["bottom"] = "2.3cm"
geometry["paddings"] = paddings

# 拼音设置： xpinyin宏包
pinyin = {}
pinyin["ratio"] = "0.5"
pinyin["hsep"] = ".6em"
pinyin["vsep"] = "1em"

# 田字格设置：hanzibox宏包
# \hanziboxset{frametype=咪,framelinewidth=0.5pt,width=1.0cm,resize=real,pinyinline=true,framecolor=red,charf={\kaishu\huge},pinyinf=\scriptsize,pinyincolor=green!30!black,charcolor=green!30!black}
hanzibox = {}
hanzibox["frametype"] = "咪"
hanzibox["framelinewidth"] = "0.5pt"
hanzibox["width"] = "0.9cm"
hanzibox["resize"] = "real"
hanzibox["pinyinline"] = "true"
hanzibox["framecolor"] = "red"
hanzibox["pinyinf"] = r"\scriptsize"
hanzibox["charf"] = {"font": r"\kaishu", "fontsize": r"\huge"}
hanzibox["pinyincolor"] = r"green!30!black"
hanzibox["charcolor"] = r"green!30!black"


## 小学

In [12]:
path_xx = "../src/小学/"

# 打印小学古诗（分层）
packages = {}
packages["ctex"] = []
packages["titlesec"] = []
packages["xeCJK"] = []
packages["fontspec,xunicode,xltxtra"] = []
packages["xpinyin"] = pinyin
packages["xpinyin"]['ratio'] = "0.44"
packages["xpinyin"]['hsep'] = ".6em"
packages["geometry"] = geometry
packages["indentfirst"] = []
packages["pifont"] = []
packages["footmisc"] = {"declarations": ["perpage", "symbol*"]}
lineskip = "24pt"
parskip = "6pt"

### 小学诗歌

In [7]:
def shi_to_tex_str(shi, print_genre=False, authors={}, typesettings={"vspaces": {"after_title": 8, "after_author": 6, "after_content": 6}}):
    # convert structured shi to string ready to use in tex
    out = r"\section{" + shi["title"] + "}\n\n"
    content = ""
    # title = wrap_method("textbf", zihao(3) + " " + shi["title"]) + "\n\n"

    # if print_genre:
    #     title = shi["genre"] + "：" + title
    # content += title
    content += wrap_method("vspace", f"{typesettings['vspaces']['after_title']}pt") + "\n\n"
    author_str = ""
    if shi["author"]:
        author = shi["author"]
        if author in authors:
            author_str += "〔唐代：" + author + "〕\n\n"
        else:
            author_str += "〔" + author + "〕\n\n"
    else:
        author_str += "〔作者不详〕\n\n"
    content += wrap_env("normalsize", "\n" + author_str) + "\n"
    content += wrap_method("vspace", f"{typesettings['vspaces']['after_author']}pt") + "\n\n"
    content += wrap_env("large", "\n" + "\n\n".join([wrap_method("xpinyin*", line) for line in shi["content"]]) + "\n\n") + "\n"
    content = wrap_env("center", content) + "\n"
    content += wrap_method("vspace", f"{typesettings['vspaces']['after_content']}pt") + "\n\n"
    out += content
    return out

In [8]:
shis = load_cn_json(os.path.join(path_xx, "古诗.json"))
output_tex = "古诗集.tex"
title = "小学语文古诗集"
# shis = load_cn_json(os.path.join(path_xx, "唐诗三百首.json"))
# output_tex = "唐诗三百首.tex"
# title = "唐诗三百首"

header, footer = make_ctex_env(packages=packages, title=title, parskip=parskip, lineskip=lineskip)

# 分层
shi_by_level = {}
levels = []
for i in range(10):
    levels.append(f"第{数转中[i+1]}层") 
levels.append("其他")
# print(levels)

typesettings = {"vspaces": {"after_title": 10, "after_author": 8, "after_content": 8}}

for title, shi in shis.items():
    level = shi["level"]
    if level not in shi_by_level:
        shi_by_level[level] = {}
    shi_by_level[level][title] = shi

with open(output_tex, "w", encoding="utf-8") as f:
    f.write(header + "\n")
    for level in levels:
        f.write(r"\chapter" + wrap(level) + "\n\n")
        for title, shi in shi_by_level[level].items():
            f.write(shi_to_tex_str(shi, typesettings=typesettings) + "\n")
    f.write(footer)

In [None]:
def xiezi_to_str(zis, ncol=2, nex=3, hspace=1):
    out = "\n\n"
    boxes = ""
    i = 0
    for zi in zis:
        boxes += wrap_method("hanzibox", zi)
        for j in range(nex):
            boxes += wrap_method("hanzibox", "")
        i += 1
        if i == ncol:
            boxes += "\n\n"
            i = 0
        else:
            boxes += wrap_method("hspace", f"{hspace}em")
    out += wrap_env("center", boxes + "\n\n")
    return out

print(xiezi_to_str(texts_sz["一二三四五"]["xiezi"]))

### 小学现代文

In [3]:
def read_text(path, format="散文"):
    """Read raw text and formalize to json
    Inputs: 
    path (str): file path to the raw text.
    format (str): format of the text.
    Output:
    out (dict): a jsonifiable dictionary with formalized text.
    Example:
    out["format"]     : format of the text (in the sense of tex printing).
    out["genre"]      : genre and other tags of the text.
    out["content"]    : content of the text. A list of strings.
    out["grade"]      : recommanded student grade (for the purpose of eduation).
    out["title"]      : title of the text.
    out["author"]     : author of the text.
    out["remarks"]    : remarks concerning the text.
    out["footnotes"]  : footnotes of the content of the text.
    out["endnotes"]   : endnotes of the content of the text.
    out["vocabulary"] : vocabulary to learn (for the purpose of eduation).
    """
    lines = read(path)
    out = {}
    title = ""
    if len(lines) and len(lines[0]):
        author = ""
        grade = 0
        footnotes = []
        endnotes = []
        vocabulary = []
        remarks = []
        content = []
        out["format"] = format
        out["genre"] = [format]
        # return lines
        if format in ("散文", "书信", "小说", "剧本"):
            for line in lines:
                line0 = line.strip()
                if line0:
                    if not title:
                        title = line0
                    elif grade < 1 and line.startswith("年级："):
                        grade = int(line0[3:])
                    elif not author and line.startswith("作者："):
                        author = line0[3:]
                    elif line.startswith("备注："):
                        remarks.append(line0[3:])
                    elif line.startswith("注释："):
                        footnotes.append(line0[3:])
                    elif line.startswith("脚注："):
                        footnotes.append(line0[3:])
                    elif line.startswith("尾注："):
                        endnotes.append(line0[3:])
                    elif line.startswith("词汇："):
                        vocabulary.extend(line0[3:].split())
                    else:
                        content.append(line0)
        elif format == "诗歌":
            para = []
            for line in lines:
                line0 = line.strip()
                if line0:
                    if not title:
                        title = line0
                    elif grade < 1 and line.startswith("年级："):
                        grade = int(line0[3:])
                    elif not author and line.startswith("作者："):
                        author = line0[3:]
                    elif line.startswith("备注："):
                        remarks.append(line0[3:])
                    elif line.startswith("注释："):
                        footnotes.append(line0[3:])
                    elif line.startswith("脚注："):
                        footnotes.append(line0[3:])
                    elif line.startswith("尾注："):
                        endnotes.append(line0[3:])
                    elif line.startswith("词汇："):
                        vocabulary.extend(line0[3:].split())
                    else:
                        para.append(line0)
                elif len(para):
                    content.append("|#|".join(para))
                    para = []
            if len(para):
                content.append("|#|".join(para))
        # make footnotes dict
        footdict = {}
        i = 0
        keybase = "fn"
        content = "@".join(content)
        for note in footnotes:
            word = ""
            if note.startswith("〔"):
                word = note.split("〕")[0][1:]
                key = keybase + str(i+1)
                footdict[key] = note
                i += 1
            elif "〕" in note:  # key is already marked in the text with the format "\apost{a...}".
                key = note.split("〕")[0].split("〔")[0]
                footdict[key] = "".join(note.split(key)[1:])
            # print(word)
            if word:  # find the position to insert footnote and mark
                nfin = content.find(word) + len(word)
                content = content[:nfin] + r"\apost{" + key + "}" + content[nfin:]
        if "|#|" in content:
            content_new = []
            for para in content.split("@"):
                content_new.append(para.split("|#|"))
            content = content_new
        else:
            content = content.split("@")
        
        out["title"] = title
        out["author"] = author
        out["content"] = content
        out["remarks"] = remarks
        out["footnotes"] = footdict
        out["endnotes"] = endnotes
        out["vocabulary"] = vocabulary
        if grade:
            out["grade"] = grade
    return title, out

def text_content_to_tex_str(text, verbose=0, verseprop=0.5, format="散文", footnotes={}, endnotes=[]):
    """convert the content of a text to text string ready for tex.
    the format varies by genre:
    散文、小说
    书信
    诗歌
    剧本
    """ 
    content = text["content"]
    if "footnotes" in text:
        footnotes = text["footnotes"]
    if "format" in text:
        format = text["format"]
    out = ""
    if format in ("散文", "小说",):
        out = "\n\n".join(content) + "\n"
    elif  format == "书信":
        if verbose and not content[0].endswith("："):
            print("错误：第一行不是抬头")
            return "格式错误\n"
        out = r"\noindent " + content[0] + "\n\n" + wrap_method("vspace", "24pt") + "\n\n"
        toright = False
        toright_content = ""
        for line in content[1:]:
            if line:
                if toright:
                    toright_content += line + "\n\n"
                else:
                    out += line + "\n\n"
            else:
                toright = True
        out += wrap_method("vspace", "36pt") + "\n\n"
        out += wrap_env("flushright", toright_content) + "\n\n"
    elif  format == "诗歌":
        if not isinstance(content[0], list):
            content = [content]
        
        lineskip = " \\\\\n"
        # parskip = "\n" + wrap_method("vspace", "4pt") + "\n\n"
        parskip = "\n\n"
        out = parskip.join([wrap_env("verse", lineskip.join(par) + "\n", params=[str(verseprop)+"\\linewidth"]) for par in content])
    elif format == "剧本":
        name_set = text["characters"]
        for line in content:
            if line.startswith("\\item["):
                name = line.split("]")[0][6:]
                colored_name = r"{\color{" + name_set[name] + r"} " + name + r"}"
                out += "\\item[" + colored_name + "]" + "]".join(line.split("]")[1:])
            elif line.startswith("$"):
                colored_line = line
                for name in name_set:
                    colored_line = colored_line.replace(name, r"{\color{" + name_set[name] + r"} " + name + r"}")
                out += colored_line
            else:
                out += line
            out += "\n\n"
    for key in footnotes:
        out = out.replace("apost{"+key+"}", "footnote{" + footnotes[key] + "}")
    return out

def endnotes_to_str(endnotes, verbose=0, pinyin=False):
    """convert the endnotes to text string ready for tex."""
    out = ""
    notes = ""
    for note in endnotes:
        if pinyin and note.startswith("〔"):  # add pinyin
            suite = note[1:].split("〕")
            notes += "\item " + note[0] + r"\xpinyin*{" + suite[0] + r"}〕" + "〕".join(suite[1:]) + "\n"
        else:
            notes += "\item " + note + "\n"
    if notes:
        out = r"\newpage" + "\n\n" + r"\textbf{注释}：" + "\n\n" + r"\vspace{-1em}" + "\n\n"
        out += wrap_env("itemize", r"\setlength\itemsep{-0.2em}" + "\n" + notes)
    return out

def shizi_to_str(zis, n=8):
    out = "\clearpage" + "\n\n"
    boxes = ""
    i = 0
    for zi in zis:
        boxes += wrap_method("hanzibox", zi)
        i += 1
        if i == n:
            boxes += "\n\n"
            i = 0
    out += wrap_env("center", boxes + "\n\n")
    return out

def xiezi_to_str(zis, ncol=2, nex=4, hspace=1):
    out = ""
    boxes = ""
    i = 0
    for zi in zis:
        boxes += wrap_method("hanzibox", zi)
        for j in range(nex):
            boxes += wrap_method("hanzibox", "")
        i += 1
        if i == ncol:
            boxes += "\n\n"
            i = 0
        else:
            boxes += wrap_method("hspace", f"{hspace}em")
    out += boxes + "\n\n"
    # out += wrap_env("center", boxes + "\n\n")
    return out

def text_to_tex_str(text, typesettings={"font": {"title": {"size": 2}, "plaintext": {"size": "normalsize"}}, "vspaces": {"after_title": 12, "after_author": 6, "after_content": 6}}):
    """convert a text object to text string ready for tex
    """
    out = ""
    content = ""
    # title_fontsize = typesettings["font"]["title"]["size"]
    # title = wrap_method("textbf", zihao(title_fontsize) + " " + text["title"]) + "\n"
    title = wrap_method("chapter", text["title"]) + "\n\n"
    content += title
    # content = wrap_env("center", content) + "\n"
    # content += wrap_method("vspace", f"{typesettings['vspaces']['after_title']}pt") + "\n\n"
    content += wrap_env(typesettings["font"]["plaintext"]["size"], "\n" + text_content_to_tex_str(text) + "\n")
    # content += wrap_method("vspace", f"{typesettings['vspaces']['after_content']}pt") + "\n\n"
    out += content + "\n\n"
    # out += wrap_method("newpage", keep_wrapper=False) + "\n\n"
    if "endnotes" in text:
        out += endnotes_to_str(text["endnotes"])
    if "shizi" in text:
        nchars = 10
        if len(text["shizi"]) % 10 == 1:
            nchars = 8
        out += shizi_to_str(text["shizi"], n=nchars) + "\n\n"
        if "xiezi" in text:
            out += xiezi_to_str(text["xiezi"]) + "\n\n"
    return out

def add_text(texts, title, content, format="散文", tags=[]):
    """Add a text to the dictionary of texts.
    Inputs:
    texts (dict): dictionary of texts. title --> content.
    title (str): title of the text.
    content (dict): content of the text.
    format (str): format of the text.
    tags (list of str): tags to describe the text.
    Output:
    texts: updated dictionary of texts. 
    """
    if len(tags):
        content["genre"] = tags
    if format == "剧本":
        if title not in texts:
            script_keys = []
            for _, text in texts.items():
                if text["format"] == "剧本" and "key" in text:
                    script_keys.append(int(text["key"].split("-")[1]))
            if len(script_keys):
                script_key = "script-" + str(max(script_keys) + 1)
            else:
                script_key = "script-1"
                
        else:
            script_key = texts[title]["key"]
        name_set, color_set = set_char_colors(content["content"], script_key)
        
    texts[title] = content
    if format == "剧本":
        texts[title]["key"] = script_key
        texts[title]["characters"] = name_set
        texts[title]["character_colors"] = color_set
    return texts


In [13]:
packages = {}
packages["ctex"] = []
packages["titlesec"] = []
packages["xeCJK"] = []
packages["verse"] = []
packages["fontspec,xunicode,xltxtra"] = []
packages["xpinyin"] = pinyin
packages["geometry"] = geometry
packages["indentfirst"] = []
packages["pifont"] = []
packages["footmisc"] = {"declarations": ["perpage", "symbol*"]}
lineskip = "24pt"
parskip = "6pt"

In [14]:
texts_xx = load_cn_json("../src/小学/阅读课文.json")

text_format = "剧本"
tags = ["戏曲", "节选", "名著", "古白话文", "悲剧"]
title, content = read_text("草稿.tex", format=text_format)

if title:
    print(f"新增课文：{title}")
    texts = add_text(texts_xx, title, content, text_format, tags)
    dump_cn_json("../src/小学/阅读课文.json", texts_xx)

In [15]:
texts_xx = load_cn_json("../src/小学/阅读课文.json")

booktitle = "小学语文课文集萃"
header, footer = make_ctex_env(packages=packages, title=booktitle, parskip=parskip, lineskip=lineskip)
typesettings = {}
typesettings["vspaces"] = {"after_title": 36, "after_author": 16, "after_content": 16}
typesettings["font"] = {"plaintext": {"size": "large"}}

with open("小学现代文阅读课文.tex", "w", encoding="utf-8") as f:
    f.write(header + "\n")
    for title, text in sort_dict_with(texts_xx):
        # print(title)
        if text["grade"] > 0:
            f.write(text_to_tex_str(text, typesettings=typesettings) + "\n")
    f.write(footer)

In [139]:
texts_xx = load_cn_json("../src/小学/阅读课文.json")

grade_count = {}
title_by_grade = {}
title_by_genre = {}
genre_by_grade = {}
for title, text in texts_xx.items():
    if "grade" not in text:
        text["grade"] = 1
    g = text["grade"]
    if g not in grade_count:
        grade_count[g] = 0
    if g not in title_by_grade:
        title_by_grade[g] = []
    if g not in genre_by_grade:
        genre_by_grade[g] = {}
    for genre in text["genre"]:
        if genre not in genre_by_grade[g]:
            genre_by_grade[g][genre] = []
        if genre not in title_by_genre:
            title_by_genre[genre] = []
    grade_count[text["grade"]] += 1
    title_by_grade[text["grade"]].append(title)
    for genre in text["genre"]:
        genre_by_grade[g][genre].append(title)
        title_by_genre[genre].append(title)

for g in title_by_grade:
    print(f"{g} 年级 ({len(title_by_grade[g])})")
    nice_print(title_by_grade[g])

2 年级 (16)
['狼和小羊', '翠鸟', '揠苗助长', '守株待兔', '初冬']
['秋天', '坐井观天', '骆驼和羊', '狐狸和乌鸦', '曹冲称象']
['乌鸦喝水', '狐狸和公鸡', '老狼分饼', '叶公好龙', '十二月花名歌']
['画蛇添足']
3 年级 (48)
['茅以升立志造桥', '美丽的小兴安岭', '大海的歌', '让我们荡起双桨', '小马过河']
['刻舟求剑 ', '八角楼上', '赵州桥', '南京长江大桥', '雨']
['放风筝', '荷花', '掩耳盗铃', '自相矛盾', '滥竽充数']
['惊弓之鸟', '绿色的办公室', '黄继光', '颐和园', '五彩池']
['青蛙的眼睛', '爬山虎的脚', '课间十分钟', '日出', '捞铁牛']
['纸上谈兵', '趵突泉', '鸟的天堂', '桂林山水', '天安门广场']
['火烧云', '卢沟桥的狮子', '海上日出', '董存瑞舍身炸碉堡', '十里长街送总理']
['狐狸和山羊', '燕子', '晏子使楚', '狼牙山五壮士', '我的战友邱少云']
['草原', '马踏飞燕', '伏尔加河上的纤夫', '牛郎织女的故事', '搭船的鸟']
['狐假虎威', '塞翁失马', '买椟还珠']
4 年级 (54)
['我和企鹅', '白求恩大夫（节选改编）', '我的弟弟“小萝卜头”', '帐篷', '参观人民大会堂']
['海底世界', '故乡的杨梅', '杏儿熟了', '春蚕', '李时珍']
['画杨桃', '珍贵的教科书', '爸爸和书', '小珊迪', '劳动最有滋味']
['花生花', '种子', '观潮', '高大的皂荚树', '海滨小城']
['蝙蝠和雷达', '各种各样的玻璃', '糖画', '西门豹', '中国石']
['古井', '峨眉道上', '太阳', '绿叶', '九寨沟']
['兵马俑', '冬眠', '七月的天山', '小英雄雨来', '参观刘家峡水电站']
['小站', '挑山工', '可爱的草塘', '雪猴', '鲸']
['圆明园的毁灭', '喂药（汤姆索亚历险记节选）', '阁楼（小公主节选）', '冀中的地道战', '草船借箭']
['田忌赛马', '记金华的双龙洞', '丰碑', '镜泊湖奇观', '伟大

## 中学

In [16]:
texts_xx = load_cn_json("../src/小学/阅读课文.json")
texts_cz = load_cn_json("../src/中学/阅读课文.json")

titles1 = set(list(texts_xx.keys()))
titles2 = set(list(texts_cz.keys()))

titles1 & titles2

set()

### 现代文

In [17]:
packages = {}
packages["ctex"] = []
packages["titlesec"] = []
packages["xeCJK"] = []
packages["verse"] = []
packages["fontspec,xunicode,xltxtra"] = []
packages["xpinyin"] = pinyin
packages["geometry"] = geometry
packages["indentfirst"] = []
packages["pifont"] = []
packages["enumitem"] = []
packages["footmisc"] = {"declarations": ["perpage", "symbol*"]}
xcolor = {}
xcolor["declarations"] = ["table", "dvipsnames"]
packages["xcolor"] = xcolor

typesettings = {}
typesettings["vspaces"] = {"after_title": 36, "after_author": 16, "after_content": 16}
typesettings["font"] = {"plaintext": {"size": "normalsize"}}

In [437]:
def sort_zs(path):
    lines = read("草稿.tex")

    out = []
    zs = False
    for line in lines:
        if line[0] in "0987654321":
            zs = True
            continue
        if zs:
            parts = line.split("：")
            out.append(f"注释：〔{parts[0]}〕" + "：".join(parts[1:]))
            zs = False
        else:
            newline ="".join([w for w in line if w not in "0987654321"])
            out.append(newline)

    with open("草稿.tex", "w", encoding="utf-8") as f:
        f.writelines(out)

In [18]:
texts_cz = load_cn_json("../src/中学/阅读课文.json")

import codecs
import matplotlib.pyplot as plt
import numpy as np
from random import shuffle

text_format = "剧本"
tags = ["戏曲", "节选", "名著", "古白话文", "悲剧"]
title, content = read_text("草稿.tex", format=text_format)

if title:
    print(f"新增课文：{title}")
    texts = add_text(texts_cz, title, content, text_format, tags)
    dump_cn_json("../src/中学/阅读课文.json", texts_cz)

In [19]:
booktitle = "中学语文课文集萃"
texts_cz = load_cn_json("../src/中学/阅读课文.json")

lineskip = "24pt"
parskip = "6pt"
package_update_xcolor(packages, texts_cz)
header, footer = make_ctex_env(packages=packages, title=booktitle, parskip=parskip, lineskip=lineskip)
with open("中学现代文阅读课文.tex", "w", encoding="utf-8") as f:
    f.write(header + "\n")
    for title, text in sort_dict_with(texts_cz):
        # print(title)
        if text["grade"] > 0:
            f.write(text_to_tex_str(text, typesettings=typesettings) + "\n")
        # break
    f.write(footer)

In [10]:
grade = 11
print([(g, grade_count[g]) for g in range(7, 13)])
print(sum([grade_count[g] for g in range(7, 13)]))
nice_print(title_by_grade[grade])

KeyError: 7

In [6]:
for title, text in texts_cz.items():
    if text["author"] == "鲁迅":
        if "节选" in text["genre"]:
            print(title+ "（节选）")
        else:
            print(title)

从百草园到三味书屋
阿长与山海经
论雷峰塔的倒掉
“友邦惊诧”论
社戏
故乡
藤野先生
孔乙己
中国人失掉自信力了吗
拿来主义
祝福
聪明人和傻子和奴才
记念刘和珍君
《呐喊》自序
药
阿Q正传（节选）


In [9]:
texts_xx = load_cn_json("../src/小学/阅读课文.json")
grade_count = {}
title_by_grade = {}
title_by_genre = {}
genre_by_grade = {}
for title, text in texts_xx.items():
    if "grade" not in text:
        text["grade"] = 1
    g = text["grade"]
    if g not in grade_count:
        grade_count[g] = 0
    if g not in title_by_grade:
        title_by_grade[g] = []
    if g not in genre_by_grade:
        genre_by_grade[g] = {}
    for genre in text["genre"]:
        if genre not in genre_by_grade[g]:
            genre_by_grade[g][genre] = []
        if genre not in title_by_genre:
            title_by_genre[genre] = []
    grade_count[text["grade"]] += 1
    title_by_grade[text["grade"]].append(title)
    for genre in text["genre"]:
        genre_by_grade[g][genre].append(title)
        title_by_genre[genre].append(title)

res = [(k,len(v)) for k, v in title_by_genre.items()]

# alist = res
def printsort_int(alist, rev=False):
    ma = max([b for (_, b) in alist])
    tem = [[] for _ in range(ma+1)]
    for (a, b) in alist:
        tem[b].append(a)
    
    if rev:
        for i, a in enumerate(tem[::-1]):
            if len(a):
                print(ma-i, a)
    else:
        for i, a in enumerate(tem):
            if len(a):
                print(i, a)

printsort_int(res, True)

87 ['记叙文']
55 ['散文']
26 ['说明文']
25 ['寓言']
23 ['描写文']
20 ['小说']
18 ['回忆', '文言文翻译']
16 ['人物', '地方介绍']
14 ['成语故事']
12 ['报告文学']
11 ['科普']
10 ['游记']
9 ['名人故事']
8 ['抒情']
7 ['诗歌']
5 ['动物']
4 ['事物介绍', '纪实文学', '描写']
3 ['人物介绍', '借事说理', '议论文', '童话', '纪实']
2 ['借物喻理', '名著', '节选', '写景', '歌词', '民谣']
1 ['古文翻译', '笔记', '借物抒情', '书信', '科幻', '神话传说', '幻想', '经典', '声明', '应用文', '言志', '民俗', '植物', '时令', '儿歌', '白话文']


In [7]:
# [title for title in texts_cz if texts_cz[title]['author'] == "鲁迅"]
texts_cz = load_cn_json("../src/中学/阅读课文.json")
grade_count = {}
title_by_grade = {}
title_by_genre = {}
genre_by_grade = {}
for title, text in texts_cz.items():
    g = text["grade"]
    if g not in grade_count:
        grade_count[g] = 0
    if g not in title_by_grade:
        title_by_grade[g] = []
    if g not in genre_by_grade:
        genre_by_grade[g] = {}
    for genre in text["genre"]:
        if genre not in genre_by_grade[g]:
            genre_by_grade[g][genre] = []
        if genre not in title_by_genre:
            title_by_genre[genre] = []
    grade_count[text["grade"]] += 1
    title_by_grade[text["grade"]].append(title)
    for genre in text["genre"]:
        genre_by_grade[g][genre].append(title)
        title_by_genre[genre].append(title)

res = [(k,len(v)) for k, v in title_by_genre.items()]

# alist = res
def printsort_int(alist, rev=False):
    ma = max([b for (_, b) in alist])
    tem = [[] for _ in range(ma+1)]
    for (a, b) in alist:
        tem[b].append(a)
    
    if rev:
        for i, a in enumerate(tem[::-1]):
            if len(a):
                print(ma-i, a)
    else:
        for i, a in enumerate(tem):
            if len(a):
                print(i, a)

printsort_int(res, True)

32 ['小说']
28 ['散文']
26 ['经典']
23 ['抒情', '诗歌']
19 ['节选']
17 ['议论文']
16 ['自然', '现实主义', '浪漫主义']
15 ['回忆']
14 ['说明文']
12 ['景物', '科普']
11 ['人物']
10 ['批判现实主义']
9 ['演讲', '意象']
8 ['倡议', '叙事']
7 ['记叙文', '论述', '象征主义']
6 ['社会百态']
5 ['纪实', '讽刺']
4 ['四季', '建筑', '鼓动']
3 ['名著', '游记']
2 ['童话', '借物抒情', '动物', '报告文学', '书信', '驳论', '议论', '想象', '序言', '话剧', '意识流', '说理']
1 ['悼词', '抗议', '友情', '纪实文学', '植物', '亲情', '提出问题', '戏剧', '墙头诗', '新闻稿', '对话录', '科学', '推理', '辩论', '批评', '传记', '寓言', '纪念', '都市童话', '插叙', '回信', '阐述', '剧本', '地方介绍', '公开信']


In [248]:
# nice_print(title_by_genre["自然"])
grade_count

{7: 43, 8: 36, 9: 24, 10: 15, 12: 2, 11: 1}

In [7]:
def get_content(text):
    out = ''
    content = text['content']
    if text['format'] == '诗歌':
        for block in content:
            for line in block:
                out += line
    else:
        for line in content:
            out += line
    return out

In [210]:
texts_cz = load_cn_json("../src/小学/阅读课文.json")

for title, text in texts_cz.items():
    if text["format"] == "诗歌":
        continue
    footnotes = text['footnotes']
    foots_new = {}
    keybase = "fn"
    content = text['content']
    content_new = []
    s = get_content(text)
    if s.find("footnote") < 0:  # no foonote numerated
        foots_new = {}
        i = 0
        out = "|#|".join(text["content"])
        for note in footnotes:
            word = ""
            if note.startswith("〔"):
                word = note.split("〕")[0][1:]
                key = keybase + str(i+1)
                foots_new[key] = note
                i += 1
            elif "〕" in note:  # key is already marked in the text with the format "\apost{a...}".
                key = note.split("〕")[0].split("〔")[0]
                foots_new[key] = "".join(note.split(key)[1:])
            # print(word)
            if word:  # find the position to insert footnote and mark
                nfin = out.find(word) + len(word)
                out = out[:nfin] + r"\apost{" + key + "}" + out[nfin:]
        content_new = out.split("|#|")
    else:  # footnote numerated
        for i, note in enumerate(footnotes):
            key = keybase + str(i+1)
            foots_new[key] = note
        i = 0  # counter for line
        j = 0  # counter for note
        line = content[i]
        while i < len(content) and j < len(footnotes):
            if line.find("footnote{"+str(j+1)+"}") < 0:  # if you cannot find a note in this line
                content_new.append(line)  # get to original line
                i += 1
                line = content[i]  # load the next line
            else:  # if you find a note in this line 
                key = keybase + str(j+1)
                line = line.replace("footnote{"+str(j+1)+"}", "apost{" + key + "}")  # replace
                j += 1  # move to the next note
        content_new.append(line)
        i += 1
        while i < len(content):
            line = content[i]
            content_new.append(line)
            i += 1            
            
    text['footnotes'] = foots_new
    text['content'] = content_new

In [213]:
dump_cn_json("../src/小学/阅读课文.json", texts_cz)

In [218]:
texts = load_cn_json("../src/小学/阅读课文.json")

for title, text in texts.items():
    if isinstance(text["footnotes"], list):
        text["footnotes"] = {}

dump_cn_json("../src/小学/阅读课文.json", texts)

In [25]:
lines = read("草稿.tex")

characters = []

for line in lines:
    line = line.rstrip()
    if line.startswith("［") and line.endswith("］") and line[-2] in ("云", "唱"):
        # print(line)
        content = line[1:-2]
        if content.endswith("，"):
            if "扮" in content:
                character = content.split("扮")[1].split("上")[0]
            else:
                character = content[:2]
            # print(character)
            characters.append(character)
        else:
            if "，" in content:
                character = content.split("，")[-1]
                # print(content)
                # print(character)
            else:
                character = content[:3]
                # print(character, content)
            characters.append(character)
        
characters

['监斩官',
 '刽子',
 '正旦',
 '刽子',
 '正旦',
 '刽子',
 '正旦',
 '卜儿',
 '刽子',
 '正旦',
 '刽子',
 '卜儿',
 '正旦',
 '刽子',
 '正旦',
 '监斩官',
 '正旦',
 '监斩官',
 '正旦',
 '刽子',
 '正旦',
 '监斩官',
 '正旦',
 '正旦',
 '监斩官',
 '正旦',
 '刽子',
 '刽子',
 '正旦',
 '监斩官',
 '刽子',
 '监斩官']

## 识字

In [13]:
path_sz = "../src/小学/发蒙识字.json"

In [43]:
# texts_sz = load_cn_json("../src/小学/发蒙识字 copy.json")
# texts_li = load_cn_json("../src/小学/发蒙识字.json")
# texts_nu = {}

# # for text in texts_li:
# #     title = text["title"]
# #     for text2 in texts_sz:
# #         if text2["title"]
# titles = {}
# titles_2 = {}
# for name, text in texts_sz.items():
#     title = text["title"]
#     titles[name] = title
#     if title in titles_2:
#         print(title, name, titles_2[title])
#     titles_2[title] = name

# for name, title in titles.items():
#     if name != title:
#         print(name, title)

# for text in texts_li:
#     name = titles_2[text["title"]]
#     texts_nu[name] = text

# old = set(list(texts_sz.keys()))
# new = set(list(texts_nu.keys()))

# # old - new, new - old

# for name in titles:
#     if texts_sz[name]["title"] != texts_nu[name]["title"]:
#         print(name, texts_sz[name]["title"], texts_nu[name]["title"])

# dump_cn_json_compact("../src/小学/发蒙识字.json", texts_nu)

人一 人
家 我有一个家
你开心吗 你开心吗？


In [4]:
texts_sz = load_cn_json("../src/小学/发蒙识字.json")
dump_cn_json_compact("../src/小学/发蒙识字.json", texts_sz)

In [19]:
import matplotlib.pyplot as plt

texts_sz = load_cn_json("../src/小学/发蒙识字 copy.json")

# arrange the texts by numero
nx = np.sort(np.array([text['numero'] for idx, text in texts_sz.items()]))
texts_nu = []
for i in nx:
    for idx, text in texts_sz.items():
        if text['numero'] == i:
            texts_nu.append(text)

n = 0
b = 10
for text in texts_nu:
    n += b
    text["numero"] = n
    print(n, text["title"])

10 我是中国人
20 一二三四五
30 人
40 田
50 比大小
60 开门
70 山村
80 你我他
90 田鸟
100 鸟鱼虫
110 山羊
120 方向
130 我有一个家
140 看地图
150 工农兵
160 张开口
170 这是什么
180 你吃什么
190 谁比我高
200 时间
210 问路
220 过马路
230 你叫什么名字
240 画
250 下雨啦
260 小猫读书
270 今天天气好
280 车子吃什么油
290 小舟
300 我有一张床
310 我的笔盒
320 找朋友
330 你开心吗？
340 切西瓜
350 丢手绢
360 秋天
370 画彩虹
380 风筝
390 大桥
400 中秋
410 拍皮球
420 你知不知道
430 雪地里的小画家
440 二月二
450 菜市场
460 大扫除
470 东海龙宫
480 江南
490 鹅
500 春天在哪里
510 看星星
520 小燕子


In [21]:
L1 = "人,一,二,三,四,五,六,七,八,九,十,口,手,足,头,耳,目,牙,日,月,山,水,火,土,天,地,田,雨,中,上,下,大,小,多,少,爸,妈,有,在,个,门,开,学,不,来,去,见,工,农,兵,力".split(",")
L2 = "生,你,他,比,走,跑,跳,叫,吃,喝,看,听,说,青,草,羊,牛,马,鸟,鱼,虫,黄,里,飞,点,方,早,阳,左,右,东,南,西,北,前,后,家,儿,子,男,女,哥,弟,姐,妹,和,只,要,车,电,用,行,道,线,红,绿,灯,牙,几,间,嘴,外,江,船,以,自,能,问,先,往,路,公,园,直,再,就,请,花,黑,白,来,去,见,在,个".split(",")
L3 = "时,间,季,年,秒,现,半,没,高,屋,到,谁,说,凉,群,树,叶,从,字,排,落,啊,要,种,果,发,长,知,广,深,海,老,虎,肉,答,哪,眼,睛,影,湖,唱,雪,鸭,狗,笔,步,用,蛙,洞,为,找,朋,友,笑,床,被,枕,柜,台,衣,服,干,读,书,页,合,气,两,朵,背,包,太,阳,空,今,明,春,秋,哭,买,卖,猫,冬,晴,玩,星,晚,午,菜,饭,米,面,奶,茶,校,店,市,村,河".split(",")

In [8]:
import re
from collections import defaultdict, Counter
from pathlib import Path
import os
from datetime import datetime

def add_suffix(filename, path):
    # 获取当前日期并格式化为 YYYYMMDD 格式
    time_suffix = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    
    # 分离文件名和扩展名
    name, extension = filename.rsplit('.', 1)
    project_name = os.path.splitext(os.path.basename(path))[0]
    
    # 添加日期后缀并返回新文件名
    new_filename = f"{name}_{project_name}_{time_suffix}.{extension}"
    return new_filename

def load_textbook(path):
    textbook = load_cn_json(path)
    
    nx = np.sort(np.array([text['numero'] for _, text in textbook.items()]))
    lessons = []
    for i in nx:
        for _, text in textbook.items():
            if text['numero'] == i:
                lessons.append(text)
    for lesson in lessons:
        lesson["numero"] = int((lesson["numero"] + 1) / 10)
    
    return lessons

def analyze_textbook(path, report_path):
    """教材分析主函数"""
    # 读取并预处理数据
    lessons = load_textbook(path)
    
    # 初始化数据结构
    analysis = {
        'chars': {
            'shizi': defaultdict(list),  # {字: [出现的课号]}
            'xiezi': defaultdict(list),
            'first_shizi': {},  # 字首次出现在shizi的课号
            'first_xiezi': {},
            'text_chars': defaultdict(set)  # 每课实际出现的汉字
        },
        'issues': {
            'duplicate_shizi': defaultdict(list),
            'duplicate_xiezi': defaultdict(list),
            'writing_before_reading': [],
            'writing_after_reading': [],
            'content_mismatch': defaultdict(list),
            'unlearned_chars': defaultdict(list),
            'reappearance': defaultdict(lambda: {'count':0, 'intervals':[]})
        }
    }
    
    # 预处理阶段：收集字符出现信息
    # =================================================================
    for lesson in lessons:
        num = lesson['numero']
        
        # 记录shizi/xiezi出现情况
        for char in lesson['shizi']:
            analysis['chars']['shizi'][char].append(num)
            if char not in analysis['chars']['first_shizi']:
                analysis['chars']['first_shizi'][char] = num
                
        for char in lesson['xiezi']:
            analysis['chars']['xiezi'][char].append(num)
            if char not in analysis['chars']['first_xiezi']:
                analysis['chars']['first_xiezi'][char] = num
                
        # 提取课文中的汉字（去除非汉字字符）
        text = ''.join([
            re.sub('？', '', re.sub(r'[^\u4e00-\u9fff]', '', line))
            for part in lesson['content']
            for line in (part if isinstance(part, list) else [part])
        ])
        analysis['chars']['text_chars'][num] = re.sub('？', '', re.sub(r'[^\u4e00-\u9fff]', '', lesson["title"])) + " " + text
    
    # 问题检测逻辑
    # =================================================================
    # 检测1：重复的识字/写字任务
    for char, nums in analysis['chars']['shizi'].items():
        if len(nums) > 1:
            analysis['issues']['duplicate_shizi'][char] = nums
            
    for char, nums in analysis['chars']['xiezi'].items():
        if len(nums) > 1:
            analysis['issues']['duplicate_xiezi'][char] = nums
    
    # 检测2：写字先于识字
    for char, xiezi_first in analysis['chars']['first_xiezi'].items():
        shizi_first = analysis['chars']['first_shizi'].get(char, float('inf'))
        if shizi_first > xiezi_first:
            analysis['issues']['writing_before_reading'].append({
                'char': char,
                'xiezi_first': xiezi_first,
                'shizi_first': shizi_first if shizi_first != float('inf') else '从未出现'
            })
    
    # 检测：识字后是否学习写字
    for char, shizi_first in analysis['chars']['first_shizi'].items():
        xiezi_first = analysis['chars']['first_xiezi'].get(char, float('inf'))
        if xiezi_first >= shizi_first:
            analysis['issues']['writing_after_reading'].append({
                'char': char,
                'xiezi_first': xiezi_first if xiezi_first != float('inf') else '从未出现',
                'shizi_first': shizi_first,
                "lag": xiezi_first - shizi_first if xiezi_first != float('inf') else -1
            })

    # 检测3：学习内容是否在课文中
    for lesson in lessons:
        num = lesson['numero']
        text_chars = analysis['chars']['text_chars'][num]
        
        # 检查识字
        for char in lesson['shizi']:
            if char not in text_chars:
                analysis['issues']['content_mismatch'][num].append(
                    f"识字 '{char}' 未在课文出现"
                )
                
        # 检查写字
        for char in lesson['xiezi']:
            if char not in text_chars:
                analysis['issues']['content_mismatch'][num].append(
                    f"写字 '{char}' 未在课文出现"
                )
                
        # 检查词语（去除标点后检查）
        for ci in lesson['ci']:
            clean_ci = re.sub('？', '', re.sub(r'[^\u4e00-\u9fff]', '', ci))
            if clean_ci not in ''.join(text_chars):
                analysis['issues']['content_mismatch'][num].append(
                    f"词语 '{ci}' 未在课文出现"
                )
    
    # 检测4：未学先现
    for num in analysis['chars']['text_chars']:
        char_set = set(analysis['chars']['text_chars'][num])
        for char in char_set:
            if char != " ":
                # 该字在后续课程中才被列为shizi
                first_shizi = analysis['chars']['first_shizi'].get(char, float('inf'))
                if num < first_shizi:
                    analysis['issues']['unlearned_chars'][num].append({
                        'char': char,
                        'first_shizi': first_shizi if first_shizi != float('inf') else '从未学习',
                        "lag": first_shizi - num if first_shizi != float('inf') else -1
                    })
    
    # 检测5：复现统计
    shizi_order = sorted(analysis['chars']['first_shizi'].items(), key=lambda x: x[1])
    for i, (char, first_num) in enumerate(shizi_order):
        # 获取后续课程
        subsequent = [l for l in lessons if l['numero'] > first_num]
        prev = first_num
        for lesson in subsequent:
            if char in analysis['chars']['text_chars'][lesson['numero']]:
                interval = lesson['numero'] - prev
                prev = lesson['numero']
                analysis['issues']['reappearance'][char]['count'] += 1
                analysis['issues']['reappearance'][char]['intervals'].append(interval)
    
    # return analysis
    # 生成报告
    # =================================================================
    brief_report = generate_brief_report(analysis, lessons)
    full_report = generate_full_report(analysis, lessons)
    
    print("=== 简要检查报告 ===")
    print(brief_report)
    
    report_final_path = add_suffix(report_path, path)
    Path(report_final_path).write_text(full_report, encoding='utf-8')

    print(f"\n完整报告已保存至：{report_final_path}")
    return analysis

def generate_brief_report(analysis, lessons):
    """生成命令行简要报告"""
    report = []
    
    # 基础统计
    report.append(f"教材总课数：{len(lessons)}课")
    report.append(f"总识字量：{len(analysis['chars']['shizi'])}字")
    report.append(f"总写字量：{len(analysis['chars']['xiezi'])}字")
    
    # 问题汇总
    report.append("\n=== 问题汇总 ===")
    report.append(f"1. 重复识字任务：{len(analysis['issues']['duplicate_shizi'])}字")
    report.append(f"2. 重复写字任务：{len(analysis['issues']['duplicate_xiezi'])}字")
    report.append(f"3. 写字先于识字：{len(analysis['issues']['writing_before_reading'])}处")
    report.append(f"4. 内容不匹配：{sum(len(v) for v in analysis['issues']['content_mismatch'].values())}处")
    report.append(f"5. 未学先现字符：{sum(len(v) for v in analysis['issues']['unlearned_chars'].values())}次")

    # 识写距离
    rw_lags = np.array([int(v["lag"]) for v in analysis['issues']['writing_after_reading'] if v["lag"] >= 0])
    # print("rw_lags", rw_lags)
    lag_min = np.min(rw_lags)
    lag_max = np.max(rw_lags)
    lag_avg = np.mean(rw_lags[rw_lags>0])
    lag_med = np.median(rw_lags[rw_lags>0])
    
    report.append("\n=== 识写间隔统计 ===")
    report.append(f"最小距离：{lag_min}课")
    report.append(f"最大距离：{lag_max}课")
    report.append(f"平均距离：{lag_avg:.2f}课")
    report.append(f"中位距离：{lag_med:.2f}课")
    values, counts = np.unique(rw_lags, return_counts=True)
    bar_values = np.sort(values)[::-1]
    if len(bar_values) > 4:
        bar_values = bar_values[4]
    else:
        bar_values = 1
    frequency = dict(zip(values, counts))
    for i in sorted(values):
        if i >= bar_values:
            report.append(f"- 距离{i:3d}课：{frequency[i]}字。")

    # 预现统计
    aprioris = [c["lag"] for v in analysis['issues']['unlearned_chars'].values() for c in v if c["lag"] > 0]
    never_again = [c["lag"] for v in analysis['issues']['unlearned_chars'].values() for c in v if c["lag"] < 0]
    report.append("\n=== 预现统计 ===")
    report.append(f"平均预现距离：{sum(aprioris)/len(aprioris):.2f}课")
    report.append(f"未学字数：{len(never_again)}字")
    
    # 复现统计
    reappear_counts = [s['count'] for s in analysis['issues']['reappearance'].values()]
    report.append("\n=== 复现统计 ===")
    report.append(f"平均复现次数：{sum(reappear_counts)/len(reappear_counts):.1f}次")
    report.append(f"未复现字数：{len([c for c in reappear_counts if c ==0])}字")
    
    return '\n'.join(report)

def generate_full_report(analysis, lessons):
    """生成完整详细报告"""
    report = []
    
    # 头部信息
    report.append("教材分析完整报告\n")
    report.append(f"分析课程范围：第{lessons[0]['numero']}课 - 第{lessons[-1]['numero']}课")
    report.append("-"*50)
    
    # 详细问题列表
    def format_issue_list(title, items, formatter):
        if not items:
            return []
        output = [f"\n【{title}】"]
        for item in items:
            output.append(formatter(item))
        return output
    
    # 1. 重复识字
    report.extend(format_issue_list(
        "重复识字任务",
        analysis['issues']['duplicate_shizi'].items(),
        lambda x: f"字 '{x[0]}' 在以下课程重复出现：{x[1]}"
    ))
    
    # 2. 重复写字
    report.extend(format_issue_list(
        "重复写字任务", 
        analysis['issues']['duplicate_xiezi'].items(),
        lambda x: f"字 '{x[0]}' 在以下课程重复出现：{x[1]}"
    ))
    
    # 3. 写字先于识字
    report.extend(format_issue_list(
        "写字先于识字",
        analysis['issues']['writing_before_reading'],
        lambda x: f"字 '{x['char']}': 第{x['xiezi_first']}课要求写字，但第{x['shizi_first']}课才要求识字"
    ))
    
    # 4. 内容不匹配
    report.append("\n【课文内容匹配问题】")
    for num in sorted(analysis['issues']['content_mismatch']):
        issues = analysis['issues']['content_mismatch'][num]
        report.append(f"第{num}课：")
        report.extend([f"  - {issue}" for issue in issues])

    # 5. 先识后写
    rw_lags = np.array([int(v["lag"]) for v in analysis['issues']['writing_after_reading'] if v["lag"] >= 0])
    # print("rw_lags", rw_lags)
    values, counts = np.unique(rw_lags, return_counts=True)
    # frequency = dict(zip(values, counts))
    samples = {val: [] for val in values}

    report.append("\n【识写间隔】")
    for item in analysis['issues']['writing_after_reading']:
        if "lag" in item and item["lag"] > 0:
            samples[item["lag"]].append(item)
    
    for val, count in zip(values, counts):
        report.append(f"- 间隔{val:3d}课的生字（共{count:3d}个）：")
        for item in samples[val]:
            report.append(f"    - {item['char']}: 第{item['shizi_first']}课 → 第{item['xiezi_first']}课")
    
    # 6. 未学先现
    report.append("\n【未学先现字符】")
    for num in sorted(analysis['issues']['unlearned_chars']):
        items = analysis['issues']['unlearned_chars'][num]
        for item in items:
            if item["lag"] < 0:
                report.append(f"第{num}课出现预学字：")
                break
        for item in items:
            if item["lag"] < 0:
                # report.append(f"  - '{item['char']}'（首次学习于第{item['first_shizi']}课）")
            # else:
                report.append(f"  - '{item['char']}'{item['first_shizi']}")
    
    # 7. 复现分析
    report.append("\n【生字复现分析】")
    report.append("评估标准：")
    report.append("  优秀：复现≥3次且间隔≤5课")
    report.append("  良好：复现≥2次且间隔≤8课")
    report.append("  需改进：未达上述标准")
    
    reappear_stats = []
    for char, stats in analysis['issues']['reappearance'].items():
        avg_interval = sum(stats['intervals'])/len(stats['intervals']) if stats['intervals'] else 0
        evaluation = "优秀" if stats['count']>=3 and max(stats['intervals']+[0])<=5 else \
                    "良好" if stats['count']>=2 and max(stats['intervals']+[0])<=8 else "需改进"
        reappear_stats.append((
            char,
            stats['count'],
            f"{avg_interval:.1f}" if stats['count'] else "无复现",
            evaluation
        ))
    
    # 按复现次数排序
    reappear_stats.sort(key=lambda x: (-x[1], x[0]))
    

    report.append("\n复现不达标：")
    report.append("汉字 | 复现次数 | 平均间隔|")
    report.append("----|--------|--------|")
    for item in reappear_stats:
        if item[3] == "需改进":
            report.append(f"{item[0]} | {item[1]} | {item[2]} |")

    report.append("\n复现情况详情：")
    report.append("汉字 | 复现次数 | 平均间隔 | 评估 |")
    report.append("----|--------|--------|---- |")
    for item in reappear_stats:
        report.append(f"{item[0]} | {item[1]} | {item[2]} | {item[3]} |")
    
    return '\n'.join(report)

analysis = analyze_textbook("../src/小学/发蒙识字.json", "../out/report.txt")

=== 简要检查报告 ===
教材总课数：53课
总识字量：486字
总写字量：373字

=== 问题汇总 ===
1. 重复识字任务：0字
2. 重复写字任务：0字
3. 写字先于识字：0处
4. 内容不匹配：2处
5. 未学先现字符：38次

=== 识写间隔统计 ===
最小距离：0课
最大距离：25课
平均距离：9.50课
中位距离：9.00课
- 距离 17课：1字。
- 距离 18课：2字。
- 距离 19课：3字。
- 距离 20课：1字。
- 距离 25课：1字。

=== 预现统计 ===
平均预现距离：7.60课
未学字数：13字

=== 复现统计 ===
平均复现次数：3.6次
未复现字数：0字

完整报告已保存至：../out/report_发蒙识字_2025-02-07_15-00-34.txt


In [14]:
lessons = load_textbook(path_sz)
ziji = {}
nz = 0
zinumbs = []
zicount = []
for lesson in lessons:
    zis = []
    for zi in lesson['shizi']:
        if zi not in ziji:
            ziji[zi] = 0
            zis.append(zi)
    n = len(zis)
    nz += n
    zinumbs.append(n)
    zicount.append(nz)
    print(lesson['title'], nz, zis)

zinumbs = np.array(zinumbs)
zicount = np.array(zicount)
# print(zinumbs)
# print(zicount)
# plt.plot(zinumbs)

我是中国人 4 ['中', '人', '文', '上']
一二三四五 23 ['一', '二', '三', '四', '五', '金', '木', '水', '火', '土', '天', '地', '日', '月', '分', '见', '下', '今', '古']
人 33 ['头', '面', '身', '手', '足', '口', '牙', '目', '耳', '心']
田 43 ['山', '川', '风', '云', '雨', '田', '力', '禾', '苗', '实']
比大小 53 ['六', '七', '八', '九', '十', '比', '大', '小', '多', '少']
开门 61 ['有', '开', '门', '爸', '妈', '在', '个', '只']
山村 73 ['石', '不', '青', '鸡', '犬', '牛', '马', '闻', '村', '肥', '路', '归']
你我他 82 ['学', '生', '是', '你', '我', '他', '也', '们', '国']
田鸟 92 ['黄', '里', '麦', '鸟', '来', '飞', '吹', '点', '去', '了']
鸟鱼虫 100 ['鱼', '虫', '爪', '尾', '巴', '毛', '羽', '吃']
山羊 108 ['走', '叫', '羊', '草', '前', '后', '谁', '的']
方向 120 ['方', '早', '向', '太', '阳', '边', '左', '右', '东', '南', '西', '北']
我有一个家 131 ['儿', '子', '男', '女', '哥', '弟', '爱', '姐', '妹', '和', '家']
看地图 136 ['外', '出', '入', '看', '图']
工农兵 151 ['工', '农', '兵', '士', '民', '量', '好', '习', '起', '团', '结', '保', '卫', '世', '界']
张开口 157 ['张', '舌', '几', '颗', '唇', '间']
这是什么 161 ['什', '么', '这', '那']
你吃什么 168 ['兔', '猴', '猫', '捉', '桃', '花', '白']
谁比我高 173 

In [15]:
s7 = load_cn_json("simple700.json")
f9 = load_cn_json("frequent1000.json")

nin = []
n = 0
for lv in f9[:7]:
    ni = []
    n += 1
    for zi in lv:
        if zi in ziji:
            pass
        else:
            ni.append(zi)
    nin.append(ni)
    print("level", n, "total", len(lv), "rest", len(ni))
    if ni:
        nice_print(ni, n=10)

level 1 total 5 rest 0
level 2 total 12 rest 0
level 3 total 24 rest 1
['她']
level 4 total 38 rest 10
['而', '对', '于', '之', '都', '如', '事', '第', '样', '作']
level 5 total 61 rest 30
['总', '情', '己', '但', '些', '所', '同', '又', '意', '期']
['经', '回', '位', '因', '很', '给', '法', '斯', '次', '者']
['已', '亲', '其', '进', '此', '话', '常', '与', '正', '感']
level 6 total 92 rest 40
['理', '尔', '定', '本', '特', '孩', '相', '将', '全', '信']
['重', '每', '并', '别', '真', '才', '便', '夫', '部', '等']
['体', '却', '主', '利', '受', '表', '德', '克', '代', '员']
['许', '零', '由', '死', '写', '性', '或', '难', '教', '命']
level 7 total 148 rest 79
['拉', '神', '记', '让', '母', '父', '应', '平', '报', '关']
['至', '认', '接', '内', '英', '军', '候', '岁', '度', '带']
['解', '任', '原', '变', '通', '师', '立', '象', '失', '满']
['战', '格', '音', '条', '呢', '病', '达', '完', '求', '化']
['业', '思', '非', '罗', '钱', '积', '语', '元', '喜', '曾']
['离', '科', '言', '约', '各', '即', '反', '题', '必', '该']
['论', '交', '终', '医', '制', '决', '传', '运', '及', '则']
['房', '院', '苦', '品', '产', '精', '视', '连', '司']


In [7]:
packages = {}
packages["ctex"] = []
packages["titlesec"] = []
packages["xeCJK"] = []
packages["verse"] = []
packages["fontspec,xunicode,xltxtra"] = []
packages["xpinyin"] = pinyin
packages["hanzibox"] = hanzibox
packages["geometry"] = geometry
packages["indentfirst"] = []
packages["pifont"] = []
packages["enumitem"] = []
packages["footmisc"] = {"declarations": ["perpage", "symbol*"]}
xcolor = {}
xcolor["declarations"] = ["table", "dvipsnames"]
packages["xcolor"] = xcolor

typesettings = {}
typesettings["vspaces"] = {"after_title": 36, "after_author": 16, "after_content": 16}
typesettings["font"] = {"plaintext": {"size": "large"}}

booktitle = "发蒙识字"
texts_sz = load_cn_json("../src/小学/发蒙识字.json")

lineskip = "24pt"
parskip = "6pt"
package_update_xcolor(packages, texts_sz)
header, footer = make_ctex_env(packages=packages, title=booktitle, parskip=parskip, lineskip=lineskip)
with open("发蒙识字.tex", "w", encoding="utf-8") as f:
    f.write(header + "\n")
    for title, text in sort_dict_with(texts_sz, key="numero"):
        # print(title)
        f.write(text_to_tex_str(text, typesettings=typesettings) + "\n")
        # break
    f.write(footer)

In [189]:
s = "一、二、三、十、木、禾、上、下、土、个、八、入、大、天、人、火、文、六、七、儿、九、无、口、日、中、了、子、门、月、不、开、四、五、目、耳、头、米、见、白、田、电、也、长、山、出、飞、马、鸟、云、公、车、牛、羊、小、少、巾、牙、尺、毛、卜、又、心、风、力、手、水、广、升、足、走、方、半、巴、业、本、平、书、自、已、东、西、回、片、皮、生、里、果、几、用、鱼、今、正、雨、两、瓜、衣、来、年、左、右、万、百、丁、齐、冬、说、友、话、春、朋、高、你、绿、们、花、红、草、爷、亲、节、的、岁、行、古、处、声、知、多、忙、洗、真、认、父、扫、母、爸、写、全、完、关、家、看、笑、着、兴、画、会、妈、合、奶、放、午、收、女、气、太、早、去、亮、和、李、语、秀、千、香、听、远、唱、定、连、向、以、更、后、意、主、总、先、起、干、明、赶、净、同、专、工、才、级、队、蚂、蚁、前、房、空、网、诗、黄、林、闭、童、立、是、我、朵、叶、美、机、她、过、他、时、送、让、吗、往、吧、得、虫、很、河、借、姐、呢、呀、哪、谁、凉、怕、量、跟、最、园、脸、因、阳、为、光、可、法、石、找、办、许、别、那、到、都、吓、叫、再、做、象、点、像、照、沙、海、桥、军、竹、苗、井、面、乡、忘、想、念、王、这、从、进、边、道、贝、男、原、爱、虾、跑、吹、乐、地、老、快、师、短、淡、对、热、冷、情、拉、活、把、种、给、吃、练、学、习、非、苦、常、问、伴、间、共、伙、汽、分、要、没、孩、位、选、北、湖、南、秋、江、只、帮、星、请、雪、就、球、跳、玩、桃、树、刚、兰、座、各、带、坐、急、名、发、成、动、晚、新、有、么、在、变、什、条"
len(s.split("、"))

350

In [166]:
import requests

url = "https://www.zhonghuadiancang.com/wenxueyishu/liyoucaibanhua/150596.html"
url = "https://www.zhonghuadiancang.com/wenxueyishu/liyoucaibanhua/150597.html"
url = "https://www.zhonghuadiancang.com/wenxueyishu/liyoucaibanhua/150598.html"
url = "https://www.99csw.com/book/2780/86125.htm"

res = requests.get(url)

In [None]:
res.content.decode()

## PDF转图像

In [6]:
from pdf2image import convert_from_path
import os

# Function to convert PDF to images
def pdf_to_images(pdf_path, output_folder, filename):
    # Convert PDF to list of PIL.Image
    images = convert_from_path(pdf_path)

    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Save each image in the list
    for i, image in enumerate(images):
        image_path = os.path.join(output_folder, f"{filename}_{i + 1}.png")  # Adjust extension as needed
        image.save(image_path, "PNG")  # Adjust format as needed
        print(f"Saved {image_path}")

# Example usage
title = r"洱海一枝春"
pdf_path = r"../语文/单篇课文.pdf"  # Replace with your PDF file path
output_folder = f"../语文/{title}/"  # Replace with desired output folder path
pdf_to_images(pdf_path, output_folder, title)

Saved ../语文/洱海一枝春/洱海一枝春_1.png
Saved ../语文/洱海一枝春/洱海一枝春_2.png
Saved ../语文/洱海一枝春/洱海一枝春_3.png
Saved ../语文/洱海一枝春/洱海一枝春_4.png
Saved ../语文/洱海一枝春/洱海一枝春_5.png
Saved ../语文/洱海一枝春/洱海一枝春_6.png
Saved ../语文/洱海一枝春/洱海一枝春_7.png
