In [1]:
import json
import os
import matplotlib.pyplot as plt
import numpy as np
from outils import read, keys, load_cn_json, dump_cn_json, 中转数, 数转中, set_char_colors, nice_print, sort_dict_with

def wrap(s, wrapper="{}", keep_wrapper=False):
    """Surround ``s`` with the first and last character of ``wrapper``.

    Args:
        s: text to wrap.
        wrapper: its first character opens and its last character closes the
            wrapped text (``"{}"`` turns ``s`` into ``{s}``).
        keep_wrapper: what to return when ``s`` is empty: ``wrapper`` itself
            if True, an empty string otherwise.

    Returns:
        The wrapped text, or ``wrapper`` / ``""`` for an empty ``s``.
    """
    if not s:
        return wrapper if keep_wrapper else ""
    opening, closing = wrapper[0], wrapper[-1]
    return opening + s + closing

def make_params(params, wrapper="[]", sep=","):
    """Format an option list for a LaTeX command.

    The strings in ``params`` are joined with ``sep`` and the result is passed
    to ``wrap`` with ``wrapper``; with the defaults ``["a", "b"]`` becomes
    ``[a,b]``. An empty ``params`` gives an empty string (no brackets).
    """
    joined = sep.join(params)
    return wrap(joined, wrapper)

def wrap_env(name, content, params=[], param_wrapper="[]", param_sep=","):
    r"""Wrap ``content`` in a LaTeX ``\begin{name} ... \end{name}`` environment.

    Args:
        name: environment name.
        content: body text; every line of it is indented by four spaces.
            One trailing newline, if present, is treated as the terminator of
            the last line and does not produce an extra blank line.
        params: option strings placed right after ``\begin{name}``
            (formatted by ``make_params``). The default list is never mutated.
        param_wrapper: bracket pair used around the options.
        param_sep: separator used between the options.

    Returns:
        The environment as a string ending with a newline.
    """
    opening = r"\begin" + wrap(name) + make_params(params, wrapper=param_wrapper, sep=param_sep) + "\n"
    # Strip at most one trailing newline. Slicing unconditionally with [:-1]
    # would eat the last real character of content that has no final newline.
    body = content[:-1] if content.endswith("\n") else content
    indented = "".join("    " + line + "\n" for line in body.split("\n"))
    return opening + indented + r"\end" + wrap(name) + "\n"

def wrap_method(method, content="", wrapper="{}", keep_wrapper=True, params=[], param_wrapper="[]", param_sep=","):
    r"""Build a LaTeX command call such as ``\method[opts]{content}``.

    Args:
        method: command name without the leading backslash.
        content: argument text, wrapped with ``wrapper`` via ``wrap``.
        wrapper: bracket pair used around ``content``.
        keep_wrapper: forwarded to ``wrap``; with the default True an empty
            ``content`` still yields the bare bracket pair.
        params: option strings formatted by ``make_params``.
        param_wrapper: bracket pair used around the options.
        param_sep: separator used between the options.

    Returns:
        The command as a string (no trailing newline).
    """
    options = make_params(params, wrapper=param_wrapper, sep=param_sep)
    argument = wrap(content, wrapper=wrapper, keep_wrapper=keep_wrapper)
    return "\\" + method + options + argument

def zihao(n):
    r"""Return the ``\zihao{n}`` command text, built with ``wrap_method``."""
    size = str(n)
    return wrap_method("zihao", size)

def package_update_xcolor(packages, texts):
    """Collect the character colors of all texts into the xcolor package config.

    ``packages["xcolor"]["defined_colors"]`` is reset to an empty dict and then
    filled from the ``"character_colors"`` mapping of every text that has one;
    when two texts define the same key the later text wins.

    Args:
        packages: package configuration dict; must already contain ``"xcolor"``.
        texts: mapping of title to text dict.

    Returns:
        None. ``packages`` is modified in place.
    """
    xcolor_cfg = packages["xcolor"]
    collected = {}
    xcolor_cfg["defined_colors"] = collected
    for text in texts.values():
        if "character_colors" not in text:
            continue
        collected.update(text["character_colors"])
    packages["xcolor"] = xcolor_cfg

def make_ctex_env(document_class="ctexbook", document_class_params=("12pt", "UTF-8","openany"), packages={"ctex": [], "titlesec": []}, mainfont="Arial", lineskip="4pt", parskip="10pt", title="标题", author="", date=False, toc=True):
    r"""Build the text that opens (header) and closes (footer) a ctex document.

    The header is assembled in this order:
      1. ``\documentclass`` with its options
      2. one ``\usepackage`` line per entry of ``packages``
      3. page geometry (only if ``"geometry"`` is configured) and fonts
      4. setup commands for footmisc, xpinyin, hanzibox and xcolor, each only
         if the package is present in ``packages``
      5. global typesetting: title formats, line/paragraph skips, author, date
      6. ``\begin{document}``, ``\maketitle``, optional table of contents,
         ``\newpage``
    The footer is ``\end{document}``.

    Args:
        document_class: LaTeX document class name.
        document_class_params: options of the document class.
        packages: mapping of package name to its configuration. A
            configuration containing ``"declarations"`` gets those as
            ``\usepackage`` options. The default dict is only read.
        mainfont: font passed to ``\setmainfont``.
        lineskip: value for ``\lineskip``.
        parskip: value for ``\parskip``.
        title: document title.
        author: author name; an empty string emits an empty ``\author{}``.
        date: emit ``\today`` as the date when True, an empty date otherwise.
        toc: emit ``\tableofcontents`` when True.

    Returns:
        ``(header, footer)`` as two strings.
    """
    # ## header ##

    # document class
    header = r"\documentclass" + make_params(document_class_params) + wrap(document_class) + "\n"

    # packages: one \usepackage line each, with its optional declarations
    for name in packages:
        package_declarations = ""
        if "declarations" in packages[name]:
            package_declarations = make_params(packages[name]['declarations'])
        header += r"\usepackage" + package_declarations + wrap(name) + "\n"
    header += "\n"

    # geometry <-- geometry package
    if "geometry" in packages:
        geometry = packages["geometry"]
        paper_type = geometry["paper_size"]
        paddings = geometry["paddings"]
        margins = ",".join(f"{side}={paddings[side]}" for side in ("left", "right", "top", "bottom"))
        header += wrap_method("geometry", f"{paper_type}paper,{margins}") + "\n"

    # fonts (the CJK main font and its bold face are fixed here)
    header += r"\renewcommand{\footnotesize}{\fontsize{8.5pt}{10.5pt}\selectfont}" + "\n"
    header += wrap_method("setmainfont", mainfont) + "\n"
    header += r"\setCJKmainfont[BoldFont=STZhongsong]{汉字之美仿宋GBK 免费}" + "\n"
    header += r"\xeCJKDeclareCharClass{CJK}{`0 -> `9}" + "\n"  # apply CJK font to numbers
    header += r"\xeCJKsetup{AllowBreakBetweenPuncts=true}" + "\n"  # line alignment

    # footnote marks: define and select a symbol set named "circled"
    if "footmisc" in packages:
        # Raw string: "\d" is not a valid escape sequence in a normal literal.
        footnote_settings_content = "".join(r"{\ding{" + str(192 + i) + "}}" for i in range(10))
        footnote_settings = wrap_method("DefineFNsymbols", footnote_settings_content, params=["circled"], param_wrapper="{}")
        header += footnote_settings + "\n"
        header += wrap_method("setfnsymbol", "circled") + "\n"

    # package setups
    # xpinyin
    if "xpinyin" in packages:
        pyr = packages['xpinyin']['ratio']  # size ratio
        vsep = packages['xpinyin']['vsep']  # vertical gap
        vsep_str = "vsep={" + vsep + "}"
        hsep = packages['xpinyin']['hsep']  # horizontal gap
        hsep_str = "hsep={" + f"{hsep} plus {hsep}" + "}"
        header += wrap_method("xpinyinsetup", f"ratio={pyr},{hsep_str},{vsep_str}") + "\n"  # pinyin settings

    # hanzibox
    if "hanzibox" in packages:
        hanzibox = packages["hanzibox"]
        frametype = hanzibox['frametype']
        framelinewidth = hanzibox['framelinewidth']
        width = hanzibox['width']
        resize = hanzibox['resize']
        framecolor = hanzibox["framecolor"]
        pinyinline = hanzibox['pinyinline']
        pinyinf = hanzibox['pinyinf']
        pinyincolor = hanzibox['pinyincolor']
        charcolor = hanzibox['charcolor']
        charf = "charf={" + hanzibox["charf"]["font"] + hanzibox["charf"]["fontsize"] + "}"
        header += wrap_method("hanziboxset", f"frametype={frametype},framelinewidth={framelinewidth},width={width},resize={resize},pinyinline={pinyinline},framecolor={framecolor},{charf},pinyinf={pinyinf},pinyincolor={pinyincolor},charcolor={charcolor}") + "\n"  # hanzibox settings

    # xcolor: one \definecolor{key}{RGB}{r,g,b} per configured color.
    # "defined_colors" is filled by package_update_xcolor; treat it as empty
    # when that step was skipped instead of failing with KeyError.
    if "xcolor" in packages:
        xcolor_cfg = packages["xcolor"]
        defined_colors = xcolor_cfg.get("defined_colors", {}) if isinstance(xcolor_cfg, dict) else {}
        for key, (r, g, b) in defined_colors.items():
            header += wrap_method("definecolor", key) + r"{RGB}{" + f"{r},{g},{b}" + "}" + "\n"
        header += "\n"

    # global typesettings
    # title format
    header += r"\titleformat{\chapter}{\zihao{-1}\bfseries}{ }{16pt}{}" + "\n"
    header += r"\titleformat{\section}{\zihao{-2}\bfseries}{ }{0pt}{}" + "\n"
    header += r"\title" + wrap(r"\zihao{0} \bfseries " + title) + "\n"
    # line and paragraph skips
    header += r"\setlength{\lineskip}{" + lineskip + "}\n"  # skip length after line
    header += r"\setlength{\parskip}{" + parskip + "}\n"  # extra skip for paragraphs
    # front page format
    if author:  # author format
        header += r"\author{\zihao{2} \texttt" + wrap(author) + "}\n"
    else:
        header += r"\author{}" + "\n"
    if date:  # date format
        header += r"\date{\bfseries\today}" + "\n"
    else:
        header += r"\date{}" + "\n"

    # begin document
    header += r"\begin" + wrap("document") + "\n"
    header += r"\maketitle" + "\n"
    if toc:
        header += r"\tableofcontents" + "\n"
    header += r"\newpage" + "\n"

    # ## footer ##

    # end document
    footer = r"\end" + wrap("document") + "\n"
    return header, footer


In [2]:
# Page setup for printing (geometry package): paper size and page margins.
paddings = {  # page margins
    "left": "1.4cm",
    "right": "1.4cm",
    "top": "2.3cm",
    "bottom": "2.3cm",
}
geometry = {
    "paper_size": "a5",  # print on A5 paper
    "paddings": paddings,
}

# Pinyin annotation settings (xpinyin package).
pinyin = {
    "ratio": "0.5",
    "hsep": ".6em",
    "vsep": "1em",
}

# Character grid settings (hanzibox package). Example of the resulting command:
# \hanziboxset{frametype=咪,framelinewidth=0.5pt,width=1.0cm,resize=real,pinyinline=true,framecolor=red,charf={\kaishu\huge},pinyinf=\scriptsize,pinyincolor=green!30!black,charcolor=green!30!black}
hanzibox = {
    "frametype": "咪",
    "framelinewidth": "0.5pt",
    "width": "0.9cm",
    "resize": "real",
    "pinyinline": "true",
    "framecolor": "red",
    "pinyinf": r"\scriptsize",
    "charf": {"font": r"\kaishu", "fontsize": r"\huge"},
    "pinyincolor": r"green!30!black",
    "charcolor": r"green!30!black",
}


## 小学

In [3]:
path_xx = "../src/小学/"

# Packages used to print the primary-school classical poems (grouped by level).
packages = {}
packages["ctex"] = []
packages["titlesec"] = []
packages["xeCJK"] = []
packages["fontspec,xunicode,xltxtra"] = []
# Override the pinyin settings on a copy. Assigning `pinyin` itself and then
# editing it through `packages["xpinyin"]` would change the shared defaults,
# so every later cell that reuses `pinyin` would depend on whether this cell
# had been run.
packages["xpinyin"] = {**pinyin, "ratio": "0.44", "hsep": ".6em"}
packages["geometry"] = geometry
packages["indentfirst"] = []
packages["pifont"] = []
packages["footmisc"] = {"declarations": ["perpage", "symbol*"]}
lineskip = "24pt"
parskip = "6pt"

### 小学诗歌

In [7]:
def shi_to_tex_str(shi, print_genre=False, authors={}, typesettings={"vspaces": {"after_title": 8, "after_author": 6, "after_content": 6}}):
    r"""Convert one structured poem to LaTeX text.

    The poem becomes a ``\section`` followed by a centered block holding the
    author line (``normalsize``) and the verses (``large``, each line wrapped
    in ``\xpinyin*``), with vertical gaps taken from ``typesettings``.

    Args:
        shi: poem dict; ``"title"``, ``"author"`` and ``"content"`` (list of
            lines) are read.
        print_genre: accepted but not used by this function.
        authors: collection of author names; an author found in it is printed
            with the "唐代：" prefix.
        typesettings: ``typesettings["vspaces"]`` gives the gap sizes in pt
            for ``after_title``, ``after_author`` and ``after_content``.

    Returns:
        The LaTeX text of the poem.
    """
    def gap(which):
        # vertical space of the configured size, followed by a blank line
        return wrap_method("vspace", f"{typesettings['vspaces'][which]}pt") + "\n\n"

    heading = r"\section{" + shi["title"] + "}\n\n"

    name = shi["author"]
    if not name:
        byline = "〔作者不详〕\n\n"
    elif name in authors:
        byline = "〔唐代：" + name + "〕\n\n"
    else:
        byline = "〔" + name + "〕\n\n"

    verses = "\n\n".join(wrap_method("xpinyin*", line) for line in shi["content"])
    centered = (
        gap("after_title")
        + wrap_env("normalsize", "\n" + byline) + "\n"
        + gap("after_author")
        + wrap_env("large", "\n" + verses + "\n\n") + "\n"
    )
    return heading + wrap_env("center", centered) + "\n" + gap("after_content")

In [8]:
shis = load_cn_json(os.path.join(path_xx, "古诗.json"))
output_tex = "古诗集.tex"
title = "小学语文古诗集"
# Alternative data set (Three Hundred Tang Poems); swap these in to print it:
# shis = load_cn_json(os.path.join(path_xx, "唐诗三百首.json"))
# output_tex = "唐诗三百首.tex"
# title = "唐诗三百首"

header, footer = make_ctex_env(packages=packages, title=title, parskip=parskip, lineskip=lineskip)

# Chapter order: ten numbered levels followed by a catch-all chapter.
levels = [f"第{数转中[i+1]}层" for i in range(10)] + ["其他"]

typesettings = {"vspaces": {"after_title": 10, "after_author": 8, "after_content": 8}}

# Group the poems by level, keeping their order within each level.
shi_by_level = {}
for title, shi in shis.items():
    shi_by_level.setdefault(shi["level"], {})[title] = shi

with open(output_tex, "w", encoding="utf-8") as f:
    f.write(header + "\n")
    for level in levels:
        # A level without any poem gets no chapter. Indexing it directly would
        # raise KeyError and leave a truncated .tex file behind.
        if level not in shi_by_level:
            continue
        f.write(r"\chapter" + wrap(level) + "\n\n")
        for title, shi in shi_by_level[level].items():
            f.write(shi_to_tex_str(shi, typesettings=typesettings) + "\n")
    f.write(footer)

In [None]:
def xiezi_to_str(zis, ncol=2, nex=3, hspace=1):
    r"""Build a centered writing-practice grid for the characters in ``zis``.

    Each character gets one filled ``\hanzibox`` followed by ``nex`` empty
    boxes. After every ``ncol`` characters a blank line starts a new row;
    otherwise an ``\hspace`` of ``hspace`` em separates the groups.

    NOTE(review): another ``xiezi_to_str`` is defined later in this notebook
    and replaces this one once that cell has run.

    Returns:
        Two newlines followed by the grid wrapped in a ``center`` environment.
    """
    cells = []
    filled = 0
    for zi in zis:
        cells.append(wrap_method("hanzibox", zi))
        cells.extend(wrap_method("hanzibox", "") for _ in range(nex))
        filled += 1
        if filled == ncol:
            cells.append("\n\n")
            filled = 0
        else:
            cells.append(wrap_method("hspace", f"{hspace}em"))
    return "\n\n" + wrap_env("center", "".join(cells) + "\n\n")

# Preview the writing-practice grid of one text.
# NOTE(review): `texts_sz` is not defined in the part of the notebook visible
# here — confirm it is loaded before this cell is run.
print(xiezi_to_str(texts_sz["一二三四五"]["xiezi"]))

### 小学现代文

In [3]:
def read_text(path, format="散文"):
    """Read a raw text draft and convert it to a JSON-serialisable dict.

    The first non-empty line is the title. Lines that start (at column 0) with
    one of the prefixes 年级：/作者：/备注：/注释：/脚注：/尾注：/词汇： are metadata;
    every other non-empty line is content. For the format "诗歌" consecutive
    lines form a stanza and a blank line ends it.

    Footnotes written as "〔word〕explanation" are numbered fn1, fn2, ... and a
    marker ``\\apost{fnN}`` is inserted right after the first occurrence of
    ``word`` in the content. Footnotes written as "key〔word〕explanation" are
    stored under ``key``; their marker is expected to be in the text already.

    Inputs:
    path (str): file path to the raw text.
    format (str): format of the text.
    Output:
    title (str): title of the text ("" if the file is empty or starts with an
        empty line, in which case ``out`` is an empty dict).
    out (dict): a jsonifiable dictionary with the formalized text.
    Example:
    out["format"]     : format of the text (in the sense of tex printing).
    out["genre"]      : genre and other tags of the text.
    out["content"]    : content of the text. A list of strings, or a list of
                        stanzas (lists of strings) for multi-line poems.
    out["grade"]      : recommended student grade (for the purpose of education).
    out["title"]      : title of the text.
    out["author"]     : author of the text.
    out["remarks"]    : remarks concerning the text.
    out["footnotes"]  : footnotes of the content of the text (key -> note).
    out["endnotes"]   : endnotes of the content of the text.
    out["vocabulary"] : vocabulary to learn (for the purpose of education).
    """
    import warnings  # only needed to report footnote words missing from the text

    lines = read(path)
    out = {}
    title = ""
    if len(lines) and len(lines[0]):
        author = ""
        grade = 0
        footnotes = []
        endnotes = []
        vocabulary = []
        remarks = []
        content = []
        out["format"] = format
        out["genre"] = [format]
        is_poem = format == "诗歌"
        if is_poem or format in ("散文", "书信", "小说", "剧本"):
            para = []  # lines of the stanza being collected (poems only)
            for line in lines:
                line0 = line.strip()
                if not line0:
                    # a blank line closes the current stanza of a poem
                    if is_poem and len(para):
                        content.append("|#|".join(para))
                        para = []
                    continue
                # Prefixes are matched on the unstripped line, so an indented
                # line is never taken as metadata.
                if not title:
                    title = line0
                elif grade < 1 and line.startswith("年级："):
                    grade = int(line0[3:])
                elif not author and line.startswith("作者："):
                    author = line0[3:]
                elif line.startswith("备注："):
                    remarks.append(line0[3:])
                elif line.startswith("注释："):
                    footnotes.append(line0[3:])
                elif line.startswith("脚注："):
                    footnotes.append(line0[3:])
                elif line.startswith("尾注："):
                    endnotes.append(line0[3:])
                elif line.startswith("词汇："):
                    vocabulary.extend(line0[3:].split())
                elif is_poem:
                    para.append(line0)
                else:
                    content.append(line0)
            if is_poem and len(para):
                content.append("|#|".join(para))

        # make footnotes dict
        # The content is joined into one string ("@" between paragraphs,
        # "|#|" between the lines of a stanza) so that markers can be inserted
        # by position; it is split again below.
        footdict = {}
        i = 0
        keybase = "fn"
        content = "@".join(content)
        for note in footnotes:
            if note.startswith("〔"):
                word = note.split("〕")[0][1:]
                key = keybase + str(i+1)
                footdict[key] = note
                i += 1
                if word:  # find the position to insert footnote and mark
                    pos = content.find(word)
                    if pos < 0:
                        # find() returned -1: there is no sensible position,
                        # so report it rather than insert the marker at a
                        # bogus offset near the start of the text.
                        warnings.warn(f"footnote word {word!r} not found in text {title!r}; marker not inserted")
                    else:
                        nfin = pos + len(word)
                        content = content[:nfin] + r"\apost{" + key + "}" + content[nfin:]
            elif "〕" in note:  # key is already marked in the text with the format "\apost{a...}".
                key = note.split("〕")[0].split("〔")[0]
                if key:
                    # key is a prefix of note; keep everything after it
                    footdict[key] = note[len(key):]
        if "|#|" in content:
            content = [para.split("|#|") for para in content.split("@")]
        else:
            content = content.split("@")

        out["title"] = title
        out["author"] = author
        out["content"] = content
        out["remarks"] = remarks
        out["footnotes"] = footdict
        out["endnotes"] = endnotes
        out["vocabulary"] = vocabulary
        if grade:
            out["grade"] = grade
    return title, out

def text_content_to_tex_str(text, verbose=0, verseprop=0.5, format="散文", footnotes={}, endnotes=[]):
    r"""Convert the content of a text to LaTeX, laid out according to its format.

    Formats:
      散文, 小说: paragraphs separated by blank lines.
      书信: first line is the salutation; lines after the first empty line of
            the content are set flush right.
      诗歌: each stanza becomes a ``verse`` environment whose width is
            ``verseprop`` times ``\linewidth``.
      剧本: speaker names (from ``text["characters"]``, name -> color) are
            colored in ``\item[...]`` labels and in lines starting with "$".
    Any other format gives an empty result.

    ``text["format"]`` and ``text["footnotes"]`` override the arguments of the
    same name when present. Finally every ``apost{key}`` marker is replaced by
    ``footnote{...}`` with the matching footnote text.

    Args:
        text: text dict; ``"content"`` is required.
        verbose: when truthy, a letter whose first line does not end with "："
            prints an error and returns "格式错误\n".
        verseprop: verse width as a fraction of the line width.
        format: fallback format.
        footnotes: fallback footnote mapping (key -> note).
        endnotes: accepted but not used by this function.

    Returns:
        The LaTeX text of the content.
    """
    content = text["content"]
    footnotes = text["footnotes"] if "footnotes" in text else footnotes
    format = text["format"] if "format" in text else format

    out = ""
    if format == "散文" or format == "小说":
        out = "\n\n".join(content) + "\n"
    elif format == "书信":
        if verbose and not content[0].endswith("："):
            print("错误：第一行不是抬头")
            return "格式错误\n"
        salutation = content[0]
        body_part = ""
        signature_part = ""
        after_blank = False  # set once an empty content line has been seen
        for line in content[1:]:
            if not line:
                after_blank = True
            elif after_blank:
                signature_part += line + "\n\n"
            else:
                body_part += line + "\n\n"
        out = (
            r"\noindent " + salutation + "\n\n" + wrap_method("vspace", "24pt") + "\n\n"
            + body_part
            + wrap_method("vspace", "36pt") + "\n\n"
            + wrap_env("flushright", signature_part) + "\n\n"
        )
    elif format == "诗歌":
        # a flat list of lines is treated as a single stanza
        stanzas = content if isinstance(content[0], list) else [content]
        line_break = " \\\\\n"
        rendered = []
        for stanza in stanzas:
            rendered.append(wrap_env("verse", line_break.join(stanza) + "\n", params=[str(verseprop) + "\\linewidth"]))
        out = "\n\n".join(rendered)
    elif format == "剧本":
        name_set = text["characters"]
        pieces = []
        for line in content:
            if line.startswith("\\item["):
                label, *rest = line.split("]")
                name = label[6:]  # drop the leading "\item["
                colored_name = r"{\color{" + name_set[name] + r"} " + name + r"}"
                pieces.append("\\item[" + colored_name + "]" + "]".join(rest))
            elif line.startswith("$"):
                colored = line
                for name in name_set:
                    colored = colored.replace(name, r"{\color{" + name_set[name] + r"} " + name + r"}")
                pieces.append(colored)
            else:
                pieces.append(line)
            pieces.append("\n\n")
        out = "".join(pieces)

    for key in footnotes:
        out = out.replace("apost{" + key + "}", "footnote{" + footnotes[key] + "}")
    return out

def endnotes_to_str(endnotes, verbose=0, pinyin=False):
    r"""Convert the endnotes of a text to a LaTeX ``itemize`` list.

    Args:
        endnotes: list of note strings.
        verbose: accepted but not used by this function.
        pinyin: when True, the headword of a note written as "〔word〕..." is
            wrapped in ``\xpinyin*{...}``.

    Returns:
        "" when there are no notes; otherwise a new page with a bold "注释"
        heading followed by the notes as an ``itemize`` environment.
    """
    items = []
    for note in endnotes:
        if pinyin and note.startswith("〔"):  # add pinyin
            suite = note[1:].split("〕")
            # Raw strings: "\i" and "\x..." must reach LaTeX literally and are
            # not valid escape sequences in a normal string literal.
            items.append(r"\item " + note[0] + r"\xpinyin*{" + suite[0] + r"}〕" + "〕".join(suite[1:]) + "\n")
        else:
            items.append(r"\item " + note + "\n")
    notes = "".join(items)
    if not notes:
        return ""
    out = r"\newpage" + "\n\n" + r"\textbf{注释}：" + "\n\n" + r"\vspace{-1em}" + "\n\n"
    out += wrap_env("itemize", r"\setlength\itemsep{-0.2em}" + "\n" + notes)
    return out

def shizi_to_str(zis, n=8):
    r"""Build a page of character boxes for reading practice.

    Every character of ``zis`` is put in a ``\hanzibox``; a blank line is
    inserted after every ``n`` boxes to start a new row.

    Returns:
        ``\clearpage`` followed by the boxes in a ``center`` environment.
    """
    boxes = ""
    in_row = 0
    for zi in zis:
        boxes += wrap_method("hanzibox", zi)
        in_row += 1
        if in_row == n:
            boxes += "\n\n"
            in_row = 0
    # Raw string: "\c" is not a valid escape sequence in a normal literal.
    return r"\clearpage" + "\n\n" + wrap_env("center", boxes + "\n\n")

def xiezi_to_str(zis, ncol=2, nex=4, hspace=1):
    r"""Build a writing-practice grid for the characters in ``zis``.

    Each character gets one filled ``\hanzibox`` followed by ``nex`` empty
    boxes. After every ``ncol`` characters a blank line starts a new row;
    otherwise an ``\hspace`` of ``hspace`` em separates the groups.

    NOTE(review): an earlier cell of this notebook also defines
    ``xiezi_to_str`` (with a ``center`` environment and ``nex=3``); this
    definition is the one in effect after this cell runs.

    Returns:
        The grid followed by a blank line (no surrounding environment).
    """
    pieces = []
    column = 0
    for zi in zis:
        pieces.append(wrap_method("hanzibox", zi) + wrap_method("hanzibox", "") * nex)
        column += 1
        if column == ncol:
            pieces.append("\n\n")
            column = 0
        else:
            pieces.append(wrap_method("hspace", f"{hspace}em"))
    return "".join(pieces) + "\n\n"

def text_to_tex_str(text, typesettings={"font": {"title": {"size": 2}, "plaintext": {"size": "normalsize"}}, "vspaces": {"after_title": 12, "after_author": 6, "after_content": 6}}):
    r"""Convert a whole text object to LaTeX.

    The result is a ``\chapter`` with the title, the content (see
    ``text_content_to_tex_str``) inside the environment named by
    ``typesettings["font"]["plaintext"]["size"]``, then the endnotes if the
    text has ``"endnotes"``, then the character page if it has ``"shizi"``.
    The writing grid (``"xiezi"``) is only added when ``"shizi"`` is present.

    The character page uses rows of 10 boxes, or rows of 8 when the number of
    characters leaves a remainder of 1 modulo 10.

    Args:
        text: text dict; ``"title"`` and ``"content"`` are required.
        typesettings: only the plain-text font size entry is read here.

    Returns:
        The LaTeX text of the chapter.
    """
    heading = wrap_method("chapter", text["title"]) + "\n\n"
    body_env = typesettings["font"]["plaintext"]["size"]
    body = wrap_env(body_env, "\n" + text_content_to_tex_str(text) + "\n")
    parts = [heading + body + "\n\n"]
    if "endnotes" in text:
        parts.append(endnotes_to_str(text["endnotes"]))
    if "shizi" in text:
        per_row = 8 if len(text["shizi"]) % 10 == 1 else 10
        parts.append(shizi_to_str(text["shizi"], n=per_row) + "\n\n")
        if "xiezi" in text:
            parts.append(xiezi_to_str(text["xiezi"]) + "\n\n")
    return "".join(parts)

def add_text(texts, title, content, format="散文", tags=[]):
    """Add a text to the dictionary of texts (replacing one with the same title).

    For the format "剧本" the text also gets a script key ("script-N"), the
    character names and their colors from ``set_char_colors``. An existing
    title keeps its key; a new one gets the highest existing number plus one.

    Inputs:
    texts (dict): dictionary of texts. title --> content.
    title (str): title of the text.
    content (dict): content of the text; modified in place.
    format (str): format of the text.
    tags (list of str): tags to describe the text; when non-empty they replace
        ``content["genre"]``.
    Output:
    texts: the same dictionary, updated in place.
    """
    is_script = format == "剧本"
    if len(tags):
        content["genre"] = tags
    if is_script:
        if title in texts:
            script_key = texts[title]["key"]
        else:
            used_numbers = [
                int(other["key"].split("-")[1])
                for other in texts.values()
                if other["format"] == "剧本" and "key" in other
            ]
            if len(used_numbers):
                script_key = "script-" + str(max(used_numbers) + 1)
            else:
                script_key = "script-1"
        name_set, color_set = set_char_colors(content["content"], script_key)

    texts[title] = content
    if is_script:
        entry = texts[title]
        entry["key"] = script_key
        entry["characters"] = name_set
        entry["character_colors"] = color_set
    return texts


In [4]:
# Packages used to print the primary-school modern texts.
packages = {
    "ctex": [],
    "titlesec": [],
    "xeCJK": [],
    "verse": [],
    "fontspec,xunicode,xltxtra": [],
    "xpinyin": pinyin,
    "geometry": geometry,
    "indentfirst": [],
    "pifont": [],
    "footmisc": {"declarations": ["perpage", "symbol*"]},
}
lineskip = "24pt"
parskip = "6pt"

In [137]:
# Add the draft in "草稿.tex" to the primary-school text collection.
texts_xx = load_cn_json("../src/小学/阅读课文.json")

# Format and tags of the draft being added; edit these for each new text.
text_format = "诗歌"
tags = ["诗歌", "抒情"]
title, content = read_text("草稿.tex", format=text_format)

# An empty title means the draft could not be parsed; nothing is saved then.
if title:
    print(f"新增课文：{title}")
    # add_text updates texts_xx in place and returns it, so `texts` is the
    # same object as texts_xx. A text with the same title is replaced.
    texts = add_text(texts_xx, title, content, text_format, tags)
    dump_cn_json("../src/小学/阅读课文.json", texts_xx)

新增课文：我为少男少女们歌唱


In [5]:
# Print the primary-school modern texts as one book.
texts_xx = load_cn_json("../src/小学/阅读课文.json")

booktitle = "小学语文课文集萃"
header, footer = make_ctex_env(packages=packages, title=booktitle, parskip=parskip, lineskip=lineskip)
typesettings = {
    "vspaces": {"after_title": 36, "after_author": 16, "after_content": 16},
    "font": {"plaintext": {"size": "large"}},
}

with open("小学现代文阅读课文.tex", "w", encoding="utf-8") as f:
    f.write(header + "\n")
    # one chapter per text, in the order given by sort_dict_with
    for title, text in sort_dict_with(texts_xx):
        f.write(text_to_tex_str(text, typesettings=typesettings) + "\n")
    f.write(footer)

In [139]:
texts_xx = load_cn_json("../src/小学/阅读课文.json")

# Index the primary-school texts by grade and by genre tag.
grade_count = {}     # grade -> number of texts
title_by_grade = {}  # grade -> titles
title_by_genre = {}  # genre -> titles
genre_by_grade = {}  # grade -> genre -> titles
for title, text in texts_xx.items():
    g = text.setdefault("grade", 1)  # a text without a grade is filed under grade 1
    grade_count[g] = grade_count.get(g, 0) + 1
    title_by_grade.setdefault(g, []).append(title)
    genres_of_grade = genre_by_grade.setdefault(g, {})
    for genre in text["genre"]:
        genres_of_grade.setdefault(genre, []).append(title)
        title_by_genre.setdefault(genre, []).append(title)

# List the titles of every grade.
for g, titles in title_by_grade.items():
    print(f"{g} 年级 ({len(titles)})")
    nice_print(titles)

2 年级 (16)
['狼和小羊', '翠鸟', '揠苗助长', '守株待兔', '初冬']
['秋天', '坐井观天', '骆驼和羊', '狐狸和乌鸦', '曹冲称象']
['乌鸦喝水', '狐狸和公鸡', '老狼分饼', '叶公好龙', '十二月花名歌']
['画蛇添足']
3 年级 (48)
['茅以升立志造桥', '美丽的小兴安岭', '大海的歌', '让我们荡起双桨', '小马过河']
['刻舟求剑 ', '八角楼上', '赵州桥', '南京长江大桥', '雨']
['放风筝', '荷花', '掩耳盗铃', '自相矛盾', '滥竽充数']
['惊弓之鸟', '绿色的办公室', '黄继光', '颐和园', '五彩池']
['青蛙的眼睛', '爬山虎的脚', '课间十分钟', '日出', '捞铁牛']
['纸上谈兵', '趵突泉', '鸟的天堂', '桂林山水', '天安门广场']
['火烧云', '卢沟桥的狮子', '海上日出', '董存瑞舍身炸碉堡', '十里长街送总理']
['狐狸和山羊', '燕子', '晏子使楚', '狼牙山五壮士', '我的战友邱少云']
['草原', '马踏飞燕', '伏尔加河上的纤夫', '牛郎织女的故事', '搭船的鸟']
['狐假虎威', '塞翁失马', '买椟还珠']
4 年级 (54)
['我和企鹅', '白求恩大夫（节选改编）', '我的弟弟“小萝卜头”', '帐篷', '参观人民大会堂']
['海底世界', '故乡的杨梅', '杏儿熟了', '春蚕', '李时珍']
['画杨桃', '珍贵的教科书', '爸爸和书', '小珊迪', '劳动最有滋味']
['花生花', '种子', '观潮', '高大的皂荚树', '海滨小城']
['蝙蝠和雷达', '各种各样的玻璃', '糖画', '西门豹', '中国石']
['古井', '峨眉道上', '太阳', '绿叶', '九寨沟']
['兵马俑', '冬眠', '七月的天山', '小英雄雨来', '参观刘家峡水电站']
['小站', '挑山工', '可爱的草塘', '雪猴', '鲸']
['圆明园的毁灭', '喂药（汤姆索亚历险记节选）', '阁楼（小公主节选）', '冀中的地道战', '草船借箭']
['田忌赛马', '记金华的双龙洞', '丰碑', '镜泊湖奇观', '伟大

## 中学

In [135]:
# Titles that occur in both the primary- and the middle-school collection.
texts_xx = load_cn_json("../src/小学/阅读课文.json")
texts_cz = load_cn_json("../src/中学/阅读课文.json")

titles1 = set(texts_xx)
titles2 = set(texts_cz)

titles1.intersection(titles2)

{'猫'}

In [302]:
# List all middle-school titles. `titles2` is a set, so the order of the
# listing is not guaranteed to be the same from one kernel session to the next.
nice_print(list(titles2))

['绿', '关于圆明园的一封信', '断章', '未选择的路', '老人与海']
['看云识天气', '记一辆纺车', '从百草园到三味书屋', '藤野先生', '谁是最可爱的人']
['反对党八股', '纪念白求恩', '变色龙', '雨巷', '播种季的傍晚']
['荷塘月色', '繁星', '回忆我的母亲', '洱海一枝春', '怀疑与学问']
['中国石拱桥', '向沙漠进军', '范进中举', '中国人失掉自信力了吗', '海燕']
['土地的誓言', '包身工', '阿Q正传', '“友邦惊诧”论', '反对自由主义']
['畏惧错误就是毁灭进步', '假如生活欺骗了你', '愚公移山', '我用残损的手掌', '船夫曲']
['天上的街市', '消息两则', '雄伟的人民大会堂', '一', '国王的新衣']
['理想的阶梯', '白求恩传', '中国人民寻求救国真理的道路', '在《人民报》创刊周年纪念会上的演说', '祝福']
['孔乙己', '匆匆', '发问的精神', '批评与自我批评', '求雨']
['假使我们不去打仗', '大卫·科波菲尔', '夜', '社戏', '桨声灯影里的秦淮河']
['紫藤萝瀑布', '我的老师', '聪明人和傻子和奴才', '老山界', '最后一次的讲演']
['快乐王子', '故乡', '错误', '拿来主义', '三年以后']
['树', '竞选州长', '论雷峰塔的倒掉', '生命的意义', '故都的秋']
['威尼斯商人', '在烈日和暴雨下', '《呐喊》自序', '我的叔叔于勒', '乡愁']
['万紫千红的花', '套中人', '驿路梨花', '鲁提辖拳打镇关西', '致杨振宁']
['花市', '生物的入侵者', '太空一日', '漫谈无理数', '静寂的园子']
['麦琪的礼物', '桥', '复活', '食物从何处来', '葫芦僧判断葫芦案']
['车库里的龙', '白杨礼赞', '大自然的语言', '墙上的斑点', '春']
['药', '东方红一号发射', '刘胡兰', '阿长与山海经', '应有真正的格物致知精神']
['我爱这土地', '背影', '记念刘和珍君', '卧看牵牛织女星', '猫']
['最后一课', '挖荠菜', '雷雨', '林教头风雪山神庙', '苏州园林']


### 现代文

In [4]:
# Packages used to print the middle-school modern texts.
# xcolor starts with its \usepackage options only; the character colors are
# added to this same dict through packages["xcolor"].
xcolor = {"declarations": ["table", "dvipsnames"]}
packages = {
    "ctex": [],
    "titlesec": [],
    "xeCJK": [],
    "verse": [],
    "fontspec,xunicode,xltxtra": [],
    "xpinyin": pinyin,
    "geometry": geometry,
    "indentfirst": [],
    "pifont": [],
    "enumitem": [],
    "footmisc": {"declarations": ["perpage", "symbol*"]},
    "xcolor": xcolor,
}

typesettings = {
    "vspaces": {"after_title": 36, "after_author": 16, "after_content": 16},
    "font": {"plaintext": {"size": "normalsize"}},
}

In [437]:
def sort_zs(path):
    """Rewrite numbered annotations of a draft file as "注释：" lines, in place.

    A line starting with an ASCII digit is taken as an annotation number: it
    is dropped and the following line, "word：explanation", is rewritten to
    "注释：〔word〕explanation". In every other line all ASCII digits (the
    inline annotation numbers) are removed.

    Args:
        path: file to read and to overwrite with the converted lines.

    NOTE(review): the lines are written back with ``writelines`` exactly as
    returned by ``read``; this presumes ``read`` keeps the line endings —
    confirm against its implementation.
    """
    digits = "0123456789"
    lines = read(path)

    out = []
    pending_note = False  # True right after an annotation-number line
    for line in lines:
        # `line and ...` guards against an empty line, where line[0] fails
        if line and line[0] in digits:
            pending_note = True
            continue
        if pending_note:
            word, *explanation = line.split("：")
            out.append(f"注释：〔{word}〕" + "：".join(explanation))
            pending_note = False
        else:
            out.append("".join(ch for ch in line if ch not in digits))

    with open(path, "w", encoding="utf-8") as f:
        f.writelines(out)

In [6]:
# Add the draft in "草稿.tex" to the middle-school text collection.
texts_cz = load_cn_json("../src/中学/阅读课文.json")

# Format and tags of the draft being added; edit these for each new text.
text_format = "散文"
tags = ["散文", "抒情", "人物"]
title, content = read_text("草稿.tex", format=text_format)

# An empty title means the draft could not be parsed; nothing is saved then.
if title:
    print(f"新增课文：{title}")
    # add_text updates texts_cz in place and returns it, so `texts` is the
    # same object as texts_cz. A text with the same title is replaced.
    texts = add_text(texts_cz, title, content, text_format, tags)
    dump_cn_json("../src/中学/阅读课文.json", texts_cz)

新增课文：山地回忆


In [12]:
# Print the middle-school modern texts as one book.
booktitle = "中学语文课文集萃"
texts_cz = load_cn_json("../src/中学/阅读课文.json")

lineskip = "24pt"
parskip = "6pt"
# Collect the character colors of the texts into packages["xcolor"] before
# the header is built, because the header emits the color definitions.
# NOTE(review): `packages` and `typesettings` must come from the
# middle-school setup cell (the one that configures xcolor) — confirm the
# run order.
package_update_xcolor(packages, texts_cz)
header, footer = make_ctex_env(packages=packages, title=booktitle, parskip=parskip, lineskip=lineskip)
with open("中学现代文阅读课文.tex", "w", encoding="utf-8") as f:
    f.write(header + "\n")
    # one chapter per text, in the order given by sort_dict_with
    for title, text in sort_dict_with(texts_cz):
        # print(title)
        f.write(text_to_tex_str(text, typesettings=typesettings) + "\n")
        # break
    f.write(footer)

In [295]:
# Number of texts for each of the grades 7 to 12, their total, and the titles
# of one grade.
# NOTE(review): `grade_count` and `title_by_grade` are built by a statistics
# cell; in the visible notebook the middle-school version of that cell appears
# further down, so this cell presumably relies on it having been run first —
# confirm. A grade without any text raises KeyError here.
grade = 9
print([(g, grade_count[g]) for g in range(7, 13)])
print(sum([grade_count[g] for g in range(7, 13)]))
nice_print(title_by_grade[grade])

[(7, 26), (8, 26), (9, 26), (10, 27), (11, 14), (12, 2)]
121
['土地的誓言', '未选择的路', '东方红一号发射', '最后一课', '花市']
['藤野先生', '孔乙己', '鲁提辖拳打镇关西', '中国人民寻求救国真理的道路', '《农村调查》序言']
['竞选州长', '我的叔叔于勒', '葫芦僧判断葫芦案', '卧看牵牛织女星', '变色龙']
['威尼斯商人', '夜', '拿来主义', '快乐王子', '多收了三五斗']
['乡愁', '错误', '白求恩传', '聪明人和傻子和奴才', '求雨']
['船夫曲']


In [6]:
# List the titles written by 鲁迅, marking the texts tagged as excerpts.
for title, text in texts_cz.items():
    if text["author"] != "鲁迅":
        continue
    suffix = "（节选）" if "节选" in text["genre"] else ""
    print(title + suffix)

从百草园到三味书屋
阿长与山海经
论雷峰塔的倒掉
“友邦惊诧”论
社戏
故乡
藤野先生
孔乙己
中国人失掉自信力了吗
拿来主义
祝福
聪明人和傻子和奴才
记念刘和珍君
《呐喊》自序
药
阿Q正传（节选）


In [125]:
# Index the primary-school texts by grade and by genre tag.
texts_xx = load_cn_json("../src/小学/阅读课文.json")
grade_count = {}     # grade -> number of texts
title_by_grade = {}  # grade -> titles
title_by_genre = {}  # genre -> titles
genre_by_grade = {}  # grade -> genre -> titles
for title, text in texts_xx.items():
    g = text.setdefault("grade", 1)  # a text without a grade is filed under grade 1
    grade_count[g] = grade_count.get(g, 0) + 1
    title_by_grade.setdefault(g, []).append(title)
    genres_of_grade = genre_by_grade.setdefault(g, {})
    for genre in text["genre"]:
        genres_of_grade.setdefault(genre, []).append(title)
        title_by_genre.setdefault(genre, []).append(title)

# (genre, number of texts) pairs
res = [(genre_name, len(genre_titles)) for genre_name, genre_titles in title_by_genre.items()]

# alist = res
def printsort_int(alist, rev=False):
    """Print labels grouped by their integer count, one count per line.

    Args:
        alist: list of ``(label, count)`` pairs.
        rev: print the counts in descending order when True, ascending
            otherwise.

    Each line is ``count [labels...]`` with the labels in input order.
    An empty ``alist`` prints nothing.
    """
    # Group through a dict instead of a list with max(count)+1 buckets: that
    # needs no max() (which fails on empty input) and no list indexing (which
    # silently wraps around for negative counts).
    groups = {}
    for label, count in alist:
        groups.setdefault(count, []).append(label)
    for count in sorted(groups, reverse=rev):
        print(count, groups[count])

# Genres of the primary-school texts, most frequent first.
printsort_int(res, True)

85 ['记叙文']
58 ['散文']
26 ['说明文']
25 ['寓言']
23 ['描写文']
21 ['小说']
19 ['回忆']
18 ['文言文翻译']
15 ['地方介绍']
14 ['人物', '成语故事']
12 ['科普', '报告文学']
10 ['游记']
9 ['名人故事', '抒情']
6 ['诗歌']
5 ['事物介绍']
4 ['动物', '纪实文学', '纪实']
3 ['人物介绍', '借事说理', '议论文', '经典']
2 ['借物喻理', '书信', '童话', '写景', '描写']
1 ['古文翻译', '笔记', '借物抒情', '科幻', '神话传说', '景物', '幻想', '名著', '节选', '声明', '应用文', '言志', '民俗', '植物', '时令']


In [294]:
# [title for title in texts_cz if texts_cz[title]['author'] == "鲁迅"]
# Index the middle-school texts by grade and by genre tag.
texts_cz = load_cn_json("../src/中学/阅读课文.json")
grade_count = {}     # grade -> number of texts
title_by_grade = {}  # grade -> titles
title_by_genre = {}  # genre -> titles
genre_by_grade = {}  # grade -> genre -> titles
for title, text in texts_cz.items():
    g = text["grade"]  # every middle-school text is expected to carry a grade
    grade_count[g] = grade_count.get(g, 0) + 1
    title_by_grade.setdefault(g, []).append(title)
    genres_of_grade = genre_by_grade.setdefault(g, {})
    for genre in text["genre"]:
        genres_of_grade.setdefault(genre, []).append(title)
        title_by_genre.setdefault(genre, []).append(title)

# (genre, number of texts) pairs
res = [(genre_name, len(genre_titles)) for genre_name, genre_titles in title_by_genre.items()]

# alist = res
def printsort_int(alist, rev=False):
    """Print labels grouped by their integer count, one count per line.

    Args:
        alist: list of ``(label, count)`` pairs.
        rev: print the counts in descending order when True, ascending
            otherwise.

    Each line is ``count [labels...]`` with the labels in input order.
    An empty ``alist`` prints nothing.
    """
    # Group through a dict instead of a list with max(count)+1 buckets: that
    # needs no max() (which fails on empty input) and no list indexing (which
    # silently wraps around for negative counts).
    groups = {}
    for label, count in alist:
        groups.setdefault(count, []).append(label)
    for count in sorted(groups, reverse=rev):
        print(count, groups[count])

# Genres of the middle-school texts, most frequent first.
printsort_int(res, True)

28 ['小说']
27 ['散文']
26 ['经典']
21 ['抒情']
17 ['议论文']
16 ['诗歌', '节选', '浪漫主义']
14 ['自然', '回忆', '说明文']
12 ['现实主义']
11 ['科普']
10 ['人物']
9 ['景物', '演讲', '批判现实主义']
8 ['叙事']
7 ['记叙文', '倡议', '论述']
6 ['社会百态']
5 ['纪实', '意象']
4 ['四季', '讽刺', '建筑', '鼓动', '象征主义']
3 ['名著']
2 ['童话', '借物抒情', '动物', '报告文学', '书信', '驳论', '议论', '想象', '序言', '游记', '意识流']
1 ['悼词', '抗议', '友情', '纪实文学', '刘胡兰', '植物', '亲情', '提出问题', '戏剧', '墙头诗', '新闻稿', '对话录', '科学', '推理', '辩论', '批评', '话剧', '传记', '寓言', '纪念', '都市童话', '插叙', '回信', '说理', '阐述']


In [248]:
# Show the number of texts per grade.
# NOTE(review): `grade_count` comes from whichever statistics cell was run
# last, so the collection it describes depends on the run order — confirm.
# nice_print(title_by_genre["自然"])
grade_count

{7: 43, 8: 36, 9: 24, 10: 15, 12: 2, 11: 1}

In [7]:
def get_content(text):
    """Return the whole content of ``text`` concatenated into one string.

    For texts whose ``format`` is '诗歌' the content is a sequence of blocks,
    each a sequence of lines; for every other format it is a flat sequence of
    lines. No separator is inserted between the pieces.
    """
    content = text['content']
    if text['format'] == '诗歌':
        return ''.join(line for block in content for line in block)
    return ''.join(content)

In [210]:
def _mark_unnumbered_footnotes(content, footnotes, keybase="fn"):
    """Attach footnotes to content that carries no ``footnote{n}`` markers.

    Two note shapes are handled:
    * a note starting with ``〔word〕`` gets the next generated key
      (``fn1``, ``fn2``, ...), is stored unchanged, and an ``apost{key}`` mark
      is inserted right after the first occurrence of ``word`` in the content;
    * a note of the form ``key〔...`` / ``key...〕...`` is assumed to be marked
      in the content already; it is stored under ``key`` with that leading key
      stripped.

    Returns ``(new_content, notes, problems)`` where ``problems`` lists the
    notes that could not be placed or recognised.
    """
    sep = "|#|"  # join the lines so that a single search covers the whole text
    joined = sep.join(content)
    notes = {}
    problems = []
    generated = 0
    for note in footnotes:
        if note.startswith("〔"):
            word = note.split("〕")[0][1:]
            generated += 1
            key = keybase + str(generated)
            notes[key] = note
            if not word:
                continue
            start = joined.find(word)
            if start < 0:
                # Previously -1 was used as a position and the mark landed at a
                # meaningless place near the start of the text.
                problems.append(note)
                continue
            end = start + len(word)
            joined = joined[:end] + r"\apost{" + key + "}" + joined[end:]
        elif "〕" in note:
            key = note.split("〕")[0].split("〔")[0]
            if key:
                # Strip only the leading key; later occurrences of the same
                # characters inside the note are kept.
                notes[key] = note[len(key):]
            else:
                problems.append(note)
        else:
            problems.append(note)
    return joined.split(sep), notes, problems


def _convert_numbered_footnotes(content, footnotes, keybase="fn"):
    """Rename ``footnote{n}`` markers to ``apost{fn<n>}`` line by line.

    Markers are looked up in increasing order of ``n``; several consecutive
    markers may sit on the same line. Every note is stored under its generated
    key. Returns ``(new_content, notes, problems)`` where ``problems`` lists
    the notes whose marker was not found in order.
    """
    footnotes = list(footnotes)
    notes = {keybase + str(n): note for n, note in enumerate(footnotes, start=1)}
    new_content = []
    placed = 0  # number of notes whose marker has been renamed so far
    for line in content:
        while placed < len(footnotes):
            marker = "footnote{" + str(placed + 1) + "}"
            if marker not in line:
                break
            line = line.replace(marker, "apost{" + keybase + str(placed + 1) + "}")
            placed += 1
        new_content.append(line)
    return new_content, notes, footnotes[placed:]


texts_cz = load_cn_json("../src/小学/阅读课文.json")

for title, text in texts_cz.items():
    if text["format"] == "诗歌":
        continue
    footnotes = text['footnotes']
    if isinstance(footnotes, dict):
        # Already converted (this loop stores a dict). Converting again would
        # iterate over the keys and replace the notes with an empty dict.
        continue
    content = text['content']
    if get_content(text).find("footnote") < 0:  # no footnote numbered in the text
        content_new, foots_new, problems = _mark_unnumbered_footnotes(content, footnotes)
    else:  # footnotes numbered in the text
        content_new, foots_new, problems = _convert_numbered_footnotes(content, footnotes)
    if problems:
        print("warning:", title, "- footnotes that could not be placed:", problems)
    text['footnotes'] = foots_new
    text['content'] = content_new

In [213]:
# Write texts_cz back to the primary-school reading-texts file.
# NOTE(review): texts_cz is also the name bound to the ../src/中学 data in an earlier cell;
# confirm the footnote-conversion cell was the last one to assign it before running this.
dump_cn_json("../src/小学/阅读课文.json", texts_cz)

In [218]:
# Normalise the file so that every 'footnotes' field is a dict.
# NOTE(review): a list-valued field is replaced by an empty dict, so any
# entries it still holds are discarded - confirm that is intended.
texts = load_cn_json("../src/小学/阅读课文.json")

for text in texts.values():
    if not isinstance(text["footnotes"], list):
        continue
    text["footnotes"] = {}

dump_cn_json("../src/小学/阅读课文.json", texts)

## 识字

In [6]:
import matplotlib.pyplot as plt

texts_sz = load_cn_json("../src/小学/发蒙识字.json")

# Sorted 'numero' values of all texts (kept under its original name).
nx = np.sort(np.array([text['numero'] for text in texts_sz.values()]))

# Arrange the texts by numero. sorted() is stable and visits every text exactly
# once; the former nested scan was quadratic and appended a text several times
# whenever two texts shared the same numero.
texts_nu = sorted(texts_sz.values(), key=lambda text: text['numero'])

In [46]:
texts_sz = load_cn_json("../src/小学/发蒙识字.json")

# Set the three fields on every entry; existing values of these keys are
# overwritten. A fresh dict/list is created for each entry.
for text in texts_sz.values():
    text.update({"format": "散文", "footnotes": {}, "endnotes": []})

# Saving is left commented out, so this cell only changes the in-memory copy.
# dump_cn_json("../src/小学/发蒙识字.json", texts_sz)

In [53]:
# For each text, in texts_nu order: its numero and the length of its 'xiezi' field.
for text in texts_nu:
    numero = text["numero"]
    n_xiezi = len(text["xiezi"])
    print(numero, n_xiezi)

10 3
20 15
30 11
40 10
50 10
60 8
70 7
80 6
90 7
100 6
110 11
120 8
130 6
140 9
150 8
160 7
170 4
175 7
180 7
190 5
200 8
210 8
220 6
230 7
240 4
250 7
260 7
270 7
280 6
285 8
290 7
300 8
350 9
370 8
390 8
400 6


In [7]:
# Walk the texts in texts_nu order and record which 'shizi' characters each
# one introduces for the first time.
ziji = {}      # every character seen so far, used as an ordered set (values are 0)
nz = 0         # running total of distinct characters
zinumbs = []   # number of new characters per text
zicount = []   # cumulative number of characters after each text
for text in texts_nu:
    # dict.fromkeys drops repeats inside one text while keeping first-seen order
    zis = [zi for zi in dict.fromkeys(text['shizi']) if zi not in ziji]
    ziji.update(dict.fromkeys(zis, 0))
    n = len(zis)
    nz += n
    zinumbs.append(n)
    zicount.append(nz)
    print(text['title'], nz, zis)

zinumbs = np.array(zinumbs)
zicount = np.array(zicount)


我是中国人 9 ['中', '人', '文', '我', '上', '学', '爱', '国', '了']
一二三四五 28 ['一', '二', '三', '四', '五', '金', '木', '水', '火', '土', '天', '地', '日', '月', '分', '见', '下', '今', '古']
人 38 ['头', '面', '身', '手', '足', '口', '舌', '目', '耳', '心']
田 48 ['山', '川', '风', '云', '雨', '田', '力', '禾', '苗', '实']
比大小 58 ['六', '七', '八', '九', '十', '比', '大', '小', '多', '少']
山村 72 ['石', '不', '青', '鸡', '犬', '叫', '农', '牛', '马', '闻', '村', '肥', '路', '归']
开门 80 ['有', '开', '门', '爸', '妈', '在', '个', '只']
你我他 86 ['生', '是', '你', '他', '也', '们']
鸟鱼虫 93 ['爪', '尾', '巴', '毛', '羽', '吃', '没']
田鸟 102 ['黄', '里', '麦', '鸟', '来', '飞', '吹', '点', '去']
山羊 106 ['羊', '草', '谁', '的']
方向 121 ['方', '早', '向', '太', '阳', '走', '边', '前', '后', '左', '右', '东', '南', '西', '北']
我有一个家 131 ['儿', '子', '男', '女', '哥', '弟', '姐', '妹', '和', '家']
看地图 136 ['外', '出', '入', '看', '图']
工农兵 150 ['工', '兵', '士', '民', '量', '好', '习', '起', '团', '结', '保', '卫', '世', '界']
张开口 157 ['张', '牙', '几', '颗', '间', '嘴', '唇']
这是什么 161 ['什', '么', '这', '那']
你吃什么 170 ['兔', '虫', '猴', '猫', '捉', '桃', '花', '黑', '白']

In [None]:
# Number of new characters introduced by each text, in processing order.
# A labelled figure stands on its own; the trailing ';' hides the return-value repr.
fig, ax = plt.subplots()
ax.plot(zinumbs)
ax.set(
    title="New characters introduced per text",
    xlabel="text index",
    ylabel="number of new characters",
);

In [8]:
s7 = load_cn_json("simple700.json")
f9 = load_cn_json("frequent1000.json")

# For each of the first seven levels in f9, collect the characters that are not
# in ziji and report how many remain.
nin = []  # per level: characters not found in ziji
n = 0     # 1-based level number
for n, lv in enumerate(f9[:7], start=1):
    ni = [zi for zi in lv if zi not in ziji]
    nin.append(ni)
    print("level", n, "total", len(lv), "rest", len(ni))
    if ni:
        nice_print(ni, n=10)

level 1 total 5 rest 0
level 2 total 12 rest 0
level 3 total 24 rest 3
['着', '她', '以']
level 4 total 38 rest 11
['而', '能', '对', '于', '之', '都', '如', '事', '第', '样']
['作']
level 5 total 61 rest 31
['总', '无', '情', '己', '但', '些', '所', '同', '又', '意']
['期', '经', '回', '位', '因', '很', '给', '法', '斯', '次']
['者', '已', '亲', '其', '进', '此', '话', '常', '与', '正']
['感']
level 6 total 92 rest 47
['理', '尔', '定', '本', '特', '做', '孩', '相', '将', '全']
['信', '重', '机', '每', '并', '别', '真', '新', '才', '便']
['夫', '部', '像', '眼', '等', '体', '却', '主', '利', '受']
['表', '德', '克', '代', '员', '许', '零', '由', '死', '安']
['写', '性', '或', '难', '望', '教', '命']
level 7 total 148 rest 89
['更', '拉', '神', '记', '处', '让', '母', '父', '应', '平']
['报', '关', '放', '至', '认', '接', '告', '内', '英', '军']
['候', '岁', '度', '带', '解', '任', '原', '变', '通', '师']
['立', '象', '失', '满', '战', '格', '音', '轻', '条', '呢']
['病', '达', '完', '求', '化', '业', '思', '非', '罗', '钱']
['积', '吗', '语', '元', '喜', '曾', '离', '科', '言', '欢']
['约', '各', '即', '指', '反', '题', '必', '该', '论', '交']

In [12]:
# Packages handed to make_ctex_env, listed in the order they were originally added.
xcolor = {"declarations": ["table", "dvipsnames"]}
packages = {
    "ctex": [],
    "titlesec": [],
    "xeCJK": [],
    "verse": [],
    "fontspec,xunicode,xltxtra": [],
    "xpinyin": pinyin,
    "hanzibox": hanzibox,
    "geometry": geometry,
    "indentfirst": [],
    "pifont": [],
    "enumitem": [],
    "footmisc": {"declarations": ["perpage", "symbol*"]},
    "xcolor": xcolor,
}

typesettings = {
    "vspaces": {"after_title": 36, "after_author": 16, "after_content": 16},
    "font": {"plaintext": {"size": "large"}},
}

booktitle = "发蒙识字"
texts_sz = load_cn_json("../src/小学/发蒙识字.json")

lineskip = "24pt"
parskip = "6pt"
# Fills packages["xcolor"]["defined_colors"] from the texts' character_colors.
package_update_xcolor(packages, texts_sz)
header, footer = make_ctex_env(packages=packages, title=booktitle, parskip=parskip, lineskip=lineskip)

# Write header, every text ordered by its numero, then footer.
with open("发蒙识字.tex", "w", encoding="utf-8") as f:
    f.write(header + "\n")
    for title, text in sort_dict_with(texts_sz, key="numero"):
        tex_str = text_to_tex_str(text, typesettings=typesettings)
        f.write(tex_str + "\n")
    f.write(footer)

In [23]:
import requests

# Candidate pages to fetch; presumably consecutive chapters of the same book
# (TODO confirm). Only the last one is requested, as before.
urls = [
    "https://www.zhonghuadiancang.com/wenxueyishu/liyoucaibanhua/150596.html",
    "https://www.zhonghuadiancang.com/wenxueyishu/liyoucaibanhua/150597.html",
    "https://www.zhonghuadiancang.com/wenxueyishu/liyoucaibanhua/150598.html",
    "https://www.zhonghuadiancang.com/wenxueyishu/liyoucaibanhua/150599.html",
]
url = urls[-1]

# Without a timeout requests can wait forever on an unresponsive server.
res = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of decoding an error page later.
res.raise_for_status()

In [24]:
# Decode the raw response bytes as UTF-8 (the default codec of bytes.decode) and show the page source.
res.content.decode("utf-8")

'<!DOCTYPE html>\r\n<html lang="zh-cn">\r\n<head>\r\n<meta charset="utf-8">\r\n<meta http-equiv="X-UA-Compatible" content="IE=edge">\r\n<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no">\r\n<title>十 板人作总结_李有才板话_赵树理_在线阅读_中华典藏</title>\r\n<meta name="keywords" content="十 板人作总结,李有才板话,赵树理" />\r\n<meta name="description" content="老杨同志跟区干部们因为晚上多谈了一会话，第二天醒得迟了一点。他们一醒来，听着村里地里到处喊叫，起先还以为出了什么事，仔细一听，才知道是唱不是喊。老杨同志是本地人，一听就懂，便向大家道：&amp;ldquo;你听老百姓今天这股高兴劲儿！&amp;lsquo;干梆戏&#039;唱得多么喧！&amp;rdquo;（这地方把不打乐器的清唱" />\r\n<meta http-equiv="Cache-Control" content="no-siteapp" />\r\n<meta http-equiv="Cache-Control" content="no-transform" />\r\n<link rel="stylesheet" type="text/css" href="https://www.zhonghuadiancang.com/css/bootstrap.min.css">\r\n<link rel="stylesheet" type="text/css" href="https://www.zhonghuadiancang.com/css/read.css">\r\n<script type="text/javascript" src="https://www.zhonghuadiancang.com/js/waps.js"></script>\r\n\r\n</head>\r\n<body>\r\n\r\n<