In [1]:
import json
import spacy
from spacy import displacy
# Load the spaCy Chinese pipeline (zh_core_web_lg); parse_paragraph calls it to
# split paragraph text into tokens. The trailing comment records an optional
# `disable=["parser"]` argument that is currently not passed.
zh_parser = spacy.load('zh_core_web_lg')## disable=["parser"]

In [2]:
# Write a reduced copy of the CC-CEDICT file that drops every line containing
# "/variant of" or "/(bound".
# The source is opened read-only (nothing is written back to it), and both files
# use explicit UTF-8 so the result does not depend on the platform's default
# encoding.
with open("../data/cedict_ts.u8", "r", encoding="utf-8") as f, \
        open("../data/cedict_ts_reduced.u8", "w", encoding="utf-8") as f2:
    # Stream line by line instead of loading the whole dictionary via readlines().
    for line in f:
        if ("/variant of" not in line) and ("/(bound" not in line):
            f2.write(line)

In [3]:
from cedict_utils.cedict import CedictParser
# Parse the reduced dictionary file.
parser = CedictParser()
parser.read_file("../data/cedict_ts_reduced.u8")
entries = parser.parse()

# Index the entries by simplified form. When several entries share a simplified
# form, the last one yielded by the parser replaces the earlier ones.
simple_dict = dict(
    (entry.simplified, {'meaning': entry.meanings, 'pinyin': entry.pinyin})
    for entry in entries
)

In [4]:
def get_book_dict(book_path):
    """Load the JSON data for one book.

    Args:
        book_path: Name of the book's directory under ``../data/books``.

    Returns:
        The parsed contents of ``../data/books/{book_path}/book_data.json``.

    Raises:
        FileNotFoundError: If the book data file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read-only mode is sufficient: the file is never written here, and "r+"
    # would fail on files without write permission. UTF-8 is explicit so that
    # decoding does not depend on the platform's default encoding.
    with open(f"../data/books/{book_path}/book_data.json", "r", encoding="utf-8") as f:
        return json.load(f)

In [5]:
# get character frequency values from here... define a function and call it below
# read in both csv files and write their rows to dictionaries
import csv

def csv_to_dict(filepath):
    """Read a CSV file into a dict mapping its second column to its first.

    Rows whose first field is the literal ``"rank"`` (the header) are skipped,
    as are completely empty rows such as blank lines. For duplicate keys the
    last row wins.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        dict mapping ``row[1]`` to ``row[0]``; both are kept as the strings
        read from the file.

    Raises:
        IndexError: If a non-empty row has fewer than two fields.
    """
    d = {}
    # newline="" is what the csv module expects for files handed to csv.reader;
    # UTF-8 is explicit so decoding does not depend on the platform default.
    with open(filepath, "r", encoding="utf-8", newline="") as f:
        for row in csv.reader(f):
            # csv.reader yields [] for a blank line; indexing it would raise.
            if not row or row[0] == "rank":
                continue
            d[row[1]] = row[0]
    return d

# Lookup from the CSV's second column to its first (the "rank" column); values
# stay strings exactly as read. get_frequency reads this table.
overall_ranks = csv_to_dict("../most_common_characters_corrected.csv")

In [6]:
def get_frequency(c, total_characters=5000, ranks=None):
    """Return the rank of ``c`` divided by ``total_characters``.

    Despite the name, the result is a normalized rank rather than an occurrence
    frequency: smaller values mean the key is ranked closer to the top.

    Args:
        c: Key to look up (a single character in the notebook's usage).
        total_characters: Divisor applied to the rank. Defaults to 5000, the
            value that was previously hard-coded.
        ranks: Mapping from key to rank (int or numeric string). Defaults to
            the module-level ``overall_ranks`` table.

    Returns:
        ``int(rank) / total_characters``, or ``None`` when ``c`` has no rank
        (missing key or an empty/falsy rank value).

    Raises:
        ValueError: If the stored rank cannot be converted with ``int``.
    """
    if ranks is None:
        # Resolved at call time so the table can be (re)built after this
        # definition has run.
        ranks = overall_ranks
    rank = ranks.get(c)
    return int(rank) / total_characters if rank else None

In [7]:
# Spot check: the displayed 0.0004 is rank / 5000, i.e. this character has rank 2.
get_frequency("是")

0.0004

In [8]:
# Takes a paragraph text string and parses it using spaCy.


def parse_paragraph(p):
    """Tokenize a paragraph and annotate each token and each of its characters.

    Args:
        p: Paragraph text passed to ``zh_parser``.

    Returns:
        A list with one dict per token, holding the keys ``"text"``,
        ``"definition"``, ``"pinyin"`` and ``"characters"``. ``"characters"``
        is a list with one dict per character of the token text, holding
        ``"character"``, ``"definition"``, ``"overall_frequency"`` and
        ``"pinyin"``. Definitions and pinyin are ``None`` when ``simple_dict``
        has no entry for the text.
    """

    def lookup(text):
        # (meaning, pinyin) from simple_dict, or (None, None) without an entry.
        entry = simple_dict.get(text)
        if not entry:
            return None, None
        return entry.get("meaning"), entry.get("pinyin")

    parsed_tokens = []
    for token in zh_parser(p):
        # Per-character annotations for this token.
        parsed_characters = []
        for character in token.text:
            character_definition, character_pinyin = lookup(character)
            parsed_characters.append({
                "character": character,
                "definition": character_definition,
                "overall_frequency": get_frequency(character),
                "pinyin": character_pinyin,
            })

        # Annotation for the token as a whole.
        definition, pinyin = lookup(token.text)
        parsed_tokens.append({
            "text": token.text,
            "definition": definition,
            "pinyin": pinyin,
            "characters": parsed_characters,
        })

    return parsed_tokens

In [18]:
# book_dict = get_book_dict("dark_forest")
# p1 = book_dict.get("chapters")[7].get("paragraphs")[0]

In [15]:
def write_parsed_file_for_chapter(book_path, chapter_number):
    """Parse one chapter of a book and write the result to its own JSON file.

    The chapter dict gains a ``"parsed_paragraphs"`` key (one ``parse_paragraph``
    result per paragraph) and is dumped to
    ``../data/books/{book_path}/chapters/{chapter_number}.json``. The chapter
    title is printed as progress output.

    Args:
        book_path: Name of the book's directory under ``../data/books``.
        chapter_number: 1-based chapter number.

    Raises:
        IndexError: If ``chapter_number`` is larger than the number of chapters.
        FileNotFoundError: If the book data or the ``chapters`` directory is
            missing.
    """
    book_dict = get_book_dict(book_path)
    # NOTE: values below 1 are not rejected; 0 and negatives wrap around through
    # Python's negative indexing (0 selects the last chapter).
    chapter = book_dict["chapters"][chapter_number - 1]

    print(chapter.get("title"))

    chapter["parsed_paragraphs"] = [
        parse_paragraph(p) for p in chapter.get("paragraphs")
    ]

    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 instead of the platform default.
    with open(f"../data/books/{book_path}/chapters/{chapter_number}.json", "w", encoding="utf-8") as f:
        json.dump(chapter, f, ensure_ascii=False)

In [18]:
def get_chapter_count_for_book(book_path):
    """Return the number of entries under ``"chapters"`` in the book's data."""
    chapters = get_book_dict(book_path)["chapters"]
    return len(chapters)

In [22]:
# Parse and write every chapter of this book; the chapter numbers passed on
# are 1-based, hence range(1, chapter_count+1).
BOOK="three_body"
chapter_count = get_chapter_count_for_book(BOOK)

for chapter in range(1, chapter_count+1):
    write_parsed_file_for_chapter(BOOK, chapter)

后记


In [None]:
# Parse and write every chapter of this book; the chapter numbers passed on
# are 1-based, hence range(1, chapter_count+1).
BOOK="dark_forest"
chapter_count = get_chapter_count_for_book(BOOK)

for chapter in range(1, chapter_count+1):
    # Both arguments are required: the earlier call passed only the book name,
    # which raises TypeError for the missing chapter_number.
    write_parsed_file_for_chapter(BOOK, chapter)

In [None]:
# Parse and write every chapter of this book; the chapter numbers passed on
# are 1-based, hence range(1, chapter_count+1).
BOOK="deaths_end"
chapter_count = get_chapter_count_for_book(BOOK)

for chapter in range(1, chapter_count+1):
    # Both arguments are required: the earlier call passed only the book name,
    # which raises TypeError for the missing chapter_number.
    write_parsed_file_for_chapter(BOOK, chapter)