In [23]:
import json
import spacy
from spacy import displacy
# Load the spaCy pipeline named 'zh_core_web_lg'; parse_paragraph calls it on each paragraph.
zh_parser = spacy.load('zh_core_web_lg')  # NOTE(review): disable=["parser"] was noted here as an option; confirm whether the parser component is needed

In [24]:
# Write a reduced copy of the dictionary file, dropping every line that
# contains "/variant of" or "/(bound".
# The encoding is given explicitly so the result does not depend on the
# platform's default locale, and plain "r"/"w" modes are used because the
# input is only read and the output is only written.
with open("../data/cedict_ts.u8", "r", encoding="utf-8") as f, \
        open("../data/cedict_ts_reduced.u8", "w", encoding="utf-8") as f2:
    # Iterate the file object directly so lines are streamed rather than
    # loaded into memory all at once.
    for line in f:
        if ("/variant of" not in line) and ("/(bound" not in line):
            f2.write(line)

In [25]:
from cedict_utils.cedict import CedictParser
# Parse the reduced dictionary file into entry objects.
parser = CedictParser()
parser.read_file("../data/cedict_ts_reduced.u8")
entries = parser.parse()

# Index the entries by their simplified form. When several entries share a
# simplified form, the one that comes last in `entries` is the one kept.
simple_dict = dict(
    (entry.simplified, {'meaning': entry.meanings, 'pinyin': entry.pinyin})
    for entry in entries
)

In [26]:
def get_book_dict(book_path):
    """Load the JSON data file of one book.

    Args:
        book_path: Name of the book's directory under ``../data/books``.

    Returns:
        The object decoded from ``../data/books/<book_path>/book_data.json``.

    Raises:
        FileNotFoundError: If the book's data file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read-only mode is sufficient ("r+" would also demand write permission),
    # and the explicit encoding keeps decoding independent of the locale.
    with open(f"../data/books/{book_path}/book_data.json", "r", encoding="utf-8") as f:
        return json.load(f)

In [27]:
# get character frequency values from here... define a function and call it below
# read in both csv files and write their rows to dictionaries
import csv

def csv_to_dict(filepath):
    """Read a CSV file into a dict mapping its second column to its first.

    A row whose first field is ``"rank"`` is treated as the header and skipped.
    Blank lines and rows with fewer than two fields are skipped as well. Values
    are kept as the strings read from the file; if the same second-column value
    occurs more than once, the last row wins.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        dict: ``{row[1]: row[0]}`` for every data row.
    """
    d = {}
    # newline="" is what the csv module asks for when given a file object.
    # "utf-8-sig" reads plain UTF-8 and also strips a leading BOM, which would
    # otherwise stop the header's first field from matching "rank".
    with open(filepath, "r", encoding="utf-8-sig", newline="") as f:
        for row in csv.reader(f):
            # csv.reader yields [] for a blank line; indexing it would raise.
            if len(row) < 2 or row[0] == "rank":
                continue
            d[row[1]] = row[0]
    return d

# Lookup table read by get_frequency: maps the CSV's second column to its first ("rank") column.
overall_ranks = csv_to_dict("../most_common_characters_corrected.csv")

In [28]:
def get_frequency(c, total_characters=5000, ranks=None):
    """Return the rank of ``c`` divided by ``total_characters``.

    Despite the name, the value is a normalized rank, not an occurrence count:
    the stored rank is converted to ``int`` and divided by ``total_characters``.

    Args:
        c: Key to look up in the rank table.
        total_characters: Divisor applied to the rank. Defaults to 5000, the
            value this function previously hard-coded.
        ranks: Mapping from key to rank (string or int). When omitted, the
            notebook-level ``overall_ranks`` table is used.

    Returns:
        ``int(rank) / total_characters``, or ``None`` when ``c`` has no
        (truthy) rank in the table.
    """
    if ranks is None:
        # Resolved at call time so the function still follows the notebook's table.
        ranks = overall_ranks
    rank = ranks.get(c)
    return int(rank) / total_characters if rank else None

In [29]:
# Spot-check the lookup for a single character.
get_frequency("是")

0.0004

In [30]:
# Takes a paragraph text string and parses it with spaCy.


def parse_paragraph(p):
    """Run ``zh_parser`` over paragraph ``p`` and annotate every token.

    Args:
        p: Paragraph text.

    Returns:
        list[dict]: One dict per token with the keys ``"text"``,
        ``"definition"``, ``"pinyin"`` and ``"characters"``. ``"characters"``
        holds one dict per character of the token text with the keys
        ``"character"``, ``"definition"``, ``"overall_frequency"`` and
        ``"pinyin"``. Definitions and pinyin come from ``simple_dict`` and are
        ``None`` when the text has no entry there.
    """
    def lookup(text):
        # Returns (meaning, pinyin) from simple_dict, or (None, None) without an entry.
        entry = simple_dict.get(text)
        if not entry:
            return None, None
        return entry.get("meaning"), entry.get("pinyin")

    annotated_tokens = []
    for tok in zh_parser(p):
        # Per-character records for this token.
        char_records = []
        for ch in tok.text:
            ch_meaning, ch_pinyin = lookup(ch)
            char_records.append({
                "character": ch,
                "definition": ch_meaning,
                "overall_frequency": get_frequency(ch),
                "pinyin": ch_pinyin,
            })

        # Whole-token record, looked up by the token's full text.
        tok_meaning, tok_pinyin = lookup(tok.text)
        annotated_tokens.append({
            "text": tok.text,
            "definition": tok_meaning,
            "pinyin": tok_pinyin,
            "characters": char_records,
        })
    return annotated_tokens

In [31]:
# book_dict = get_book_dict("dark_forest")
# p1 = book_dict.get("chapters")[7].get("paragraphs")[0]

In [32]:
def write_parsed_file_for_chapter(book_path, chapter_number):
    """Parse one chapter of a book and write it out as a JSON file.

    Loads the book with ``get_book_dict``, prints the chapter title, runs
    ``parse_paragraph`` on every paragraph that is not empty or whitespace-only,
    stores the results under the chapter's ``"parsed_paragraphs"`` key, and
    writes the chapter to
    ``../data/books/<book_path>/chapters/<chapter_number>.json``.

    Args:
        book_path: Name of the book's directory under ``../data/books``.
        chapter_number: 1-based position of the chapter in the book's
            ``"chapters"`` list.

    Raises:
        IndexError: If ``chapter_number`` is below 1 or beyond the last chapter.
    """
    import os  # local import so this function stands on its own in the notebook

    # Without this guard, 0 or a negative number would silently wrap around to
    # a chapter at the end of the list and be written under the wrong name.
    if chapter_number < 1:
        raise IndexError(f"chapter_number must be >= 1, got {chapter_number}")

    book_dict = get_book_dict(book_path)
    chapter = book_dict["chapters"][chapter_number - 1]

    print(chapter.get("title"))
    # A chapter without a "paragraphs" entry is treated as having none.
    paragraphs = chapter.get("paragraphs") or []

    # Skip paragraphs that are empty or contain only whitespace.
    chapter["parsed_paragraphs"] = [
        parse_paragraph(p) for p in paragraphs if p.strip()
    ]

    out_dir = f"../data/books/{book_path}/chapters"
    os.makedirs(out_dir, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII text, so the file encoding is
    # fixed to UTF-8 rather than left to the platform default.
    with open(f"{out_dir}/{chapter_number}.json", "w", encoding="utf-8") as f:
        json.dump(chapter, f, ensure_ascii=False)

In [33]:
def get_chapter_count_for_book(book_path):
    """Return the length of the ``"chapters"`` list of the book at ``book_path``."""
    chapters = get_book_dict(book_path)["chapters"]
    return len(chapters)

In [34]:
# Parse and write every chapter of this book; chapter numbers are 1-based.
BOOK = "three_body"
chapter_count = get_chapter_count_for_book(BOOK)

for chapter in range(1, chapter_count + 1):
    write_parsed_file_for_chapter(BOOK, chapter)

写在“基石”之前
前 言
1.疯狂年代
2.寂静的春天
3.红岸之一
4.科学边界
5.台球
6.射手和农场主
7.三体、周文王、长夜
8.叶文洁
9.宇宙闪烁
10.大史
11.三体、墨子、烈焰
12.红岸之二
14.红岸之四
15.三体、哥白尼、宇宙橄榄球、三日凌空
16.三体问题
17.三体、牛顿、冯·诺依曼、秦始皇、三日连珠
18.聚会
19.三体、爱因斯坦、单摆、大撕裂
20.三体、远征
21.地球叛军
22.红岸之五
23.红岸之六
24.叛乱
25.雷志成、杨卫宁之死
26.无人忏悔
27.伊文斯
29.地球三体运动
30.两个质子
31.古筝行动
32.监听员
33.智子
34.虫子
35.尾声、遗址
后记


In [35]:
# Parse and write every chapter of this book; chapter numbers are 1-based.
BOOK = "dark_forest"
chapter_count = get_chapter_count_for_book(BOOK)

for chapter in range(1, chapter_count + 1):
    write_parsed_file_for_chapter(BOOK, chapter)

TypeError: write_parsed_file_for_chapter() missing 1 required positional argument: 'chapter_number'

In [None]:
# Parse and write every chapter of this book; chapter numbers are 1-based.
BOOK = "deaths_end"
chapter_count = get_chapter_count_for_book(BOOK)

for chapter in range(1, chapter_count + 1):
    # write_parsed_file_for_chapter requires both the book and the chapter
    # number; calling it with the book alone raises TypeError.
    write_parsed_file_for_chapter(BOOK, chapter)