In [None]:
import pandas as pd
from lxml import html

from mdict_analysis.readmdict import MDX

def _first_text(tree, xpath_expr):
    """Return the ``.text`` of the first node matching ``xpath_expr``.

    Returns None when nothing matches (the node's own ``.text`` may also be None).
    """
    matches = tree.xpath(xpath_expr)
    return matches[0].text if matches else None


# Walk every entry of the BNC-COCA dictionary and pull three fields out of each
# entry's HTML page: the band label, the frequency-level label and the total
# frequency. A missing field is recorded as None instead of aborting the run.
mdx = MDX("The BNC-COCA Lists/the-bnc-coca-lists.mdx")
words = []
labels = []
labels_preq = []
total_freqs = []

# Keys and values are decoded from bytes before use.
for key, value in mdx.items():
    words.append(key.decode())
    page_tree = html.fromstring(value.decode("utf-8"))

    labels.append(_first_text(page_tree, "//h1/span[@class='label']"))
    labels_preq.append(
        _first_text(page_tree, "//h1/span[contains(@class, 'freq-level')]")
    )

    # Strip the human-readable prefix so only the value itself is kept.
    freq_text = _first_text(page_tree, "//p[@class='word-frequency']")
    total_freqs.append(
        None if freq_text is None else freq_text.replace("Total Frequency: ", "")
    )

df = pd.DataFrame(
    dict(
        word=words,
        label=labels,
        label_freq=labels_preq,
        total_freq=total_freqs,
    )
)
df

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import spacy

# Load the spaCy pipeline with the "parser" and "ner" components disabled.
nlp = spacy.load("en_core_web_trf", disable=["parser", "ner"])
# Word-level reference table; get_word_freq_ranking merges it on "word" and
# groups the result by "label".
# NOTE(review): no cell visible here writes this CSV — presumably it is an export
# of the frame built from the MDX dictionary; confirm.
word_freq_df = pd.read_csv("bnc_coca_word_freq.csv")


def _is_content_token(token):
    """Return True when ``token`` should be counted as vocabulary.

    A token is kept when it is not a stop word, not punctuation, purely
    alphabetic, longer than two characters and not tagged as a proper noun.
    """
    return (
        not token.is_stop
        and not token.is_punct
        and token.is_alpha
        and len(token.text) > 2
        and token.pos_ != "PROPN"
    )


def process_tokens(doc):
    """Return the lower-cased lemma of every content token in ``doc``.

    :param doc: iterable of tokens exposing ``lemma_``, ``text``, ``pos_``,
        ``is_stop``, ``is_punct`` and ``is_alpha``
    :return: list of lemma strings, in document order (duplicates preserved)
    """
    return [token.lemma_.lower() for token in doc if _is_content_token(token)]


def process_tokens_and_info(doc):
    """Return ``(lower-cased lemma, part-of-speech tag)`` for every content token.

    Uses the same filter as ``process_tokens``; only the shape of each item differs.

    :param doc: iterable of tokens exposing ``lemma_``, ``text``, ``pos_``,
        ``is_stop``, ``is_punct`` and ``is_alpha``
    :return: list of ``(lemma, pos_)`` tuples, in document order
    """
    return [
        (token.lemma_.lower(), token.pos_)
        for token in doc
        if _is_content_token(token)
    ]


def df_to_normalized_vector(df, max_k=30):
    """
    Convert a vocabulary-band distribution DataFrame into a normalized vector.

    :param df: input DataFrame whose index holds band labels ('1k', '2k', ...)
        and which has a 'count' column
    :param max_k: highest frequency band to include (default: up to 30k)
    :return: numpy array of length ``max_k`` holding each band's count divided
        by the sum over bands 1..max_k; all zeros when that sum is zero
    :raises ValueError: if a label is not an integer once 'k' is removed
    """
    # Start from an all-zero vector (length = max_k).
    vector = np.zeros(max_k)

    # Fill in the bands that are present; labels outside 1..max_k are ignored.
    for label, count in df["count"].items():
        k_val = int(label.replace("k", ""))  # extract the numeric part
        if 1 <= k_val <= max_k:
            vector[k_val - 1] = count  # '1k' -> index 0, '2k' -> index 1, ...

    # Normalize. With nothing to normalize (empty input, or every label out of
    # range) dividing would produce NaNs, so return the zero vector instead.
    total = vector.sum()
    if total == 0:
        return vector
    return vector / total


def get_word_freq_ranking(file_path):
    """Count, per frequency-band label, the lemmas of a text found in ``word_freq_df``.

    The file is read, run through ``nlp`` and reduced to lemmas with
    ``process_tokens``. The distinct lemmas are then inner-joined with
    ``word_freq_df`` on "word" and the joined rows are counted per "label".

    :param file_path: path of the text file to analyse
    :return: DataFrame indexed by label with a single "count" column, ordered
        by the integer value of the label once "k" is removed
    """

    def band_number(band_labels):
        # "1k", "2k", "10k" must sort numerically, not lexicographically.
        return band_labels.str.replace("k", "").astype(int)

    lemmas = process_tokens(nlp(Path(file_path).read_text()))

    # One row per distinct lemma. The occurrence count is carried along but is
    # not used when counting rows per label below.
    lemma_counts = (
        pd.DataFrame({"word": lemmas})["word"]
        .value_counts()
        .rename_axis("word")
        .reset_index(name="count")
    )

    # Inner join: lemmas absent from the reference table are dropped.
    matched = lemma_counts.merge(word_freq_df, on="word")

    rows_per_label = matched.groupby("label")["label"].count().to_frame("count")
    return rows_per_label.sort_index(key=band_number)


In [None]:
# Resolve the book list once (sorted for a stable row order) and reuse it for
# both the vectors and the names, so the two columns are guaranteed to describe
# the same files in the same order.
books = sorted(Path("book_sentence").glob("*.txt"))
vectors = []
for book in books:
    result = df_to_normalized_vector(get_word_freq_ranking(book))
    vectors.append(result)
    print(book.stem)  # progress indicator: one line per processed book

# Store each vector as a plain Python list so it is written to the CSV as a
# list literal rather than a numpy array repr.
book_vocab_vectors = pd.DataFrame(
    {
        "book": [path.stem for path in books],
        "vector": [vector.tolist() for vector in vectors],
    }
)
book_vocab_vectors.to_csv("book_vocab_vectors.csv", index=False)

In [35]:
# For every book, write a word x part-of-speech count table to
# book_vocab/<book stem>.csv.
output_dir = Path("book_vocab")
# to_csv does not create missing directories, so make sure the target exists.
output_dir.mkdir(parents=True, exist_ok=True)

for book in Path("book_sentence").glob("*.txt"):
    doc = nlp(book.read_text())
    tokens_info = process_tokens_and_info(doc)
    # `df` keeps its name on purpose: the following cell inspects the frame
    # left over from the last iteration.
    df = pd.DataFrame(tokens_info, columns=pd.Index(["word", "pos"]))
    # Distinct lemmas in order of first appearance; used to order the rows.
    words = df["word"].unique()
    pos_counts = (
        df.pivot_table(index="word", columns="pos", aggfunc="size")
        .fillna(0)  # word/POS pairs that never occur
        .astype(int)
    )
    pos_counts.loc[words].to_csv(output_dir / (book.stem + ".csv"))


In [None]:
# Spot check on the `df` left over from the per-book loop (i.e. the last book
# processed): how many times "ability" was tagged as a NOUN.
pos_table = df.pivot_table(index="word", columns="pos", aggfunc="size")
pos_table = pos_table.fillna(0).astype(int)
pos_table.loc["ability", "NOUN"]

np.int64(3)