In [75]:
# 0. Environment
# pip install -q scikit-learn pandas numpy scipy

In [76]:
# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import kendalltau
from scipy.sparse.csgraph import minimum_spanning_tree
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse.csgraph import minimum_spanning_tree
import pandas as pd
import numpy as np
import re

# Project-local loader; its return value is iterated chapter by chapter below.
from dao_original_text_punct import get_raw_chapters

raw_chapters = get_raw_chapters()

# Project root is the parent of the notebook's working directory.
# (When running as a plain local script, use Path(__file__).parent.resolve().)
HERE = Path.cwd().parent
data_dir, out_dir = HERE / 'data', HERE / 'out'
for _folder in (data_dir, out_dir):
    # exist_ok keeps the cell re-runnable on an existing checkout.
    _folder.mkdir(exist_ok=True)

# 1. 提取段心 & 生成问句  
Extract core sentence & Yixi question

规则 Rules：  
1. 首句即「故/是以/夫唯」→ 直接采  
2. 非首句但以「故/是以/夫唯」开头 → 采  
3. 末句收束（排比/反问）→ 采  
4. 兜底首句

In [77]:
def get_core_sentence(chap: str) -> str:
    """Pick the core sentence of one chapter using four ordered rules.

    The chapter is split on the full stop ``。`` and the question mark ``？``.
    Each sentence keeps its own terminator; a trailing fragment without a
    terminator is closed with ``。``.

    Rules, applied in order:
      1/2. the first sentence (scanning from the start) that begins with
           ``故``, ``是以`` or ``夫唯``;
      3.   the last sentence, if it contains ``正言若反``, ``是谓`` or ``故``,
           a semicolon ``；``, or is a question (``？``);
      4.   otherwise the first sentence.

    Args:
        chap: text of a single chapter.

    Returns:
        The selected sentence including its terminator, or ``''`` when the
        chapter contains no sentence at all (empty / whitespace-only input).
    """
    # Split with a capturing group so the delimiter that ended each sentence is
    # retained; result alternates [text, delim, text, delim, ..., text].
    pieces = re.split(r'([。？])', chap)
    bodies = pieces[0::2]
    marks = pieces[1::2] + ['']          # the final fragment has no delimiter

    sents = []
    for body, mark in zip(bodies, marks):
        body = body.strip()
        if not body:
            continue
        # Keep '？' so rule 3 can recognise rhetorical questions; everything
        # else (including an unterminated tail) is closed with '。'.
        sents.append(body + ('？' if mark == '？' else '。'))

    if not sents:
        return ''

    # Rules 1 and 2: a sentence opening with a conclusion marker wins, earliest first.
    for sent in sents:
        if re.match(r'^故|^是以|^夫唯', sent):
            return sent

    # Rule 3: a closing sentence that sums up, is a parallel construction, or a question.
    last = sents[-1]
    if re.search(r'正言若反|是谓|故', last) or '；' in last or '？' in last:
        return last

    # Rule 4: fall back to the opening sentence.
    return sents[0]

# Trim surrounding whitespace from every chapter, then pick one core sentence each.
chapters_punct = [chapter.strip() for chapter in raw_chapters]
core_sentences = list(map(get_core_sentence, chapters_punct))

In [78]:
# Write the core sentences to a TSV keyed by 1-based chapter number.
# The ids are derived from the number of extracted sentences rather than a
# hard-coded 1..81, so the frame cannot hit a column-length mismatch.
core_df = pd.DataFrame({'chapter_id': range(1, len(core_sentences) + 1),
                        'core_sentence_zh': core_sentences})
core_df.to_csv(data_dir/'core_sentences_81.tsv', sep='\t', index=False)
core_df.head(3)

Unnamed: 0,chapter_id,core_sentence_zh
0,1,故常无欲，以观其妙；常有欲，以观其徼。
1,2,故有无相生，难易相成，长短相形，高下相倾，音声相和，前后相随。
2,3,是以圣人之治，虚其心，实其腹；弱其志，强其骨。


# 2. 手工尹喜问句  
Yixi questions (manually created)

In [79]:
# Load the manually completed question bank (yao_question_bank_81.tsv).
qa_path = data_dir / 'yao_question_bank_81.tsv'
qa_df = pd.read_csv(qa_path, sep='\t')
qa_df.head(3)

Unnamed: 0,chapter_id,core_sentence_zh,core_sentence_en,yixi_question_zh,yixi_question_en
0,1,故常无欲，以观其妙；常有欲，以观其徼。,"Hence always without desire, one observes its ...",敢问无欲何以观妙，有欲何以观徼？,Dare I ask: how does observing without desire ...
1,2,故有无相生，难易相成，长短相形，高下相倾，音声相和，前后相随。,Being and non-being create each other; difficu...,敢问有无何以相生？,Dare I ask: how do being and non-being give bi...
2,3,是以圣人之治，虚其心，实其腹；弱其志，强其骨。,Therefore the sage governs by emptying the min...,敢问虚心弱志何以成治？,Dare I ask: how does emptying the mind and wea...


# 3. 语义重排  
Semantic reordering (binary character 1–2-gram vectors + cosine + MST)

In [80]:
def reorder_factory(text_series: pd.Series, label: str, out_dir: Path):
    """Compute a similarity-based reordering of the texts and save it as CSV.

    Steps:
      1. Vectorise each text as binary counts of Han-character unigrams and
         bigrams, then take the pairwise cosine similarity.
      2. Build a minimum spanning tree over the distances ``1 - sim`` and rank
         the texts by the row sums of the tree's dense weight matrix, largest
         first.  (This is a ranking by row weight, not a traversal of the tree.)
      3. Report Kendall's tau between the original positions ``0..n-1`` and the
         new order, and the mean cosine similarity of neighbours in the new order.
      4. Write ``reorder_index_{label}.csv`` into ``out_dir``.  Column
         ``chapter_id`` holds the 1-based position in the new sequence and
         ``new_order`` the 1-based original index placed at that position.

    Args:
        text_series: one string per chapter (any number of chapters).
        label: tag used in the CSV file name and echoed in the result.
        out_dir: existing directory that receives the CSV.

    Returns:
        dict with keys ``label``, ``tau``, ``adj_cos``, ``index`` (0-based
        order array) and ``sim_mat`` (the full similarity matrix, kept for
        later plotting).
    """
    # 1. Vectorise -> similarity.  Tokens are single Han characters only, so
    #    punctuation, spaces and Latin text are ignored.
    vec = CountVectorizer(
        tokenizer=lambda s: re.findall(r'[\u4e00-\u9fff]', s),
        ngram_range=(1, 2),
        min_df=1,
        binary=True)
    doc_term = vec.fit_transform(text_series)
    sim = cosine_similarity(doc_term)
    # Take the size from the data instead of assuming 81 chapters.
    n_docs = sim.shape[0]

    # 2. MST over distances -> ranking by row weight, descending.
    mst = minimum_spanning_tree(1 - sim)
    order = np.argsort(mst.toarray().sum(axis=1))[::-1]

    # 3. Metrics.
    tau = pd.Series(np.arange(n_docs)).corr(pd.Series(order), method='kendall')
    adj_cos = np.mean([sim[i, j] for i, j in zip(order, order[1:])])

    # 4. Persist the index (1-based on disk).
    out_csv = out_dir / f'reorder_index_{label}.csv'
    pd.DataFrame({'chapter_id': np.arange(1, n_docs + 1),
                  'new_order': order + 1}).to_csv(out_csv, index=False)

    return {'label': label, 'tau': tau, 'adj_cos': adj_cos,
            'index': order, 'sim_mat': sim}

In [81]:
# Text variants to reorder; add further variants (G, H, ...) to this dict.
questions = qa_df['yixi_question_zh']
cores = qa_df['core_sentence_zh']
full_text = pd.Series(chapters_punct)

inputs = {
    'C': cores,                          # core sentence
    'D': questions + ' ' + cores,        # question + core sentence
    'E': full_text,                      # full chapter
    'F': questions + ' ' + full_text,    # question + full chapter
}

# Reorder every variant; the index CSVs are written into data_dir.
results = []
for lbl, txt in inputs.items():
    outcome = reorder_factory(txt, lbl, data_dir)
    results.append(outcome)
    print(f"{lbl:>2}  τ = {outcome['tau']:.3f}   相邻余弦 = {outcome['adj_cos']:.4f}")

 C  τ = 0.231   相邻余弦 = 0.0925
 D  τ = 0.271   相邻余弦 = 0.2328
 E  τ = 0.180   相邻余弦 = 0.1050
 F  τ = 0.155   相邻余弦 = 0.1445




In [82]:
# Reorder indices (0-based) for the six variants.
# A and B keep the original order; C-F are read back from the CSVs written by
# reorder_factory (stored 1-based, hence the "- 1").
n_chapters = len(chapters_punct)      # derived, not a hard-coded 81
orders = {
    'A': np.arange(n_chapters),
    'B': np.arange(n_chapters),
    **{tag: pd.read_csv(data_dir / f'reorder_index_{tag}.csv')['new_order'].values - 1
       for tag in ('C', 'D', 'E', 'F')},
}

# Source text of each variant, in original chapter order.
inputs_raw = {
    'A': chapters_punct,                                                                   # full chapter
    'B': [q + '\n' + chap for chap, q in zip(chapters_punct, qa_df['yixi_question_zh'])],  # question + full chapter
    'C': qa_df['core_sentence_zh'],                                                        # core sentence
    'D': qa_df['yixi_question_zh'] + ' ' + qa_df['core_sentence_zh'],                      # question + core sentence
    'E': pd.Series(chapters_punct),                                                        # full chapter
    'F': qa_df['yixi_question_zh'] + ' ' + pd.Series(chapters_punct),                      # question + full chapter
}

for lbl in orders:                    # dict order: A, B, C, D, E, F
    # 1. Apply the reorder index.
    reord_txt = [inputs_raw[lbl][i] for i in orders[lbl]]
    # 2. Write one file per variant; each block is headed by its 1-based
    #    position in the reordered sequence so the file can be split later.
    blocks = [f'--- {i:02d} ---\n{chap}' for i, chap in enumerate(reord_txt, 1)]
    (out_dir / f'V{lbl}_final.txt').write_text('\n'.join(blocks), encoding='utf-8')

print('✅ 6 份重排终版已生成 → out/V*_final.txt')

✅ 6 份重排终版已生成 → out/V*_final.txt


# 4. 评估  
Evaluation: character-level smoothness (Han-character Jaccard) & adjacent cosine similarity; Kendall τ is computed in section 3

In [None]:
def han_jaccard(chap_list):
    """Jaccard overlap of Han-character sets between consecutive texts.

    Each text is reduced to the set of characters in U+4E00..U+9FFF it
    contains.  For every pair of neighbours the score is
    ``|A & B| / |A | B|``, or ``0.0`` when both sets are empty.

    Returns:
        ``(scores, mean)`` where ``scores`` is the list of per-pair values and
        ``mean`` is ``np.mean(scores)``.
    """
    char_sets = [set(re.findall(r'[\u4e00-\u9fff]', text)) for text in chap_list]

    jacs = []
    for idx in range(len(char_sets) - 1):
        left, right = char_sets[idx], char_sets[idx + 1]
        combined = left | right
        if combined:
            jacs.append(len(left & right) / len(combined))
        else:
            # Neither neighbour has any Han character.
            jacs.append(0.0)

    return jacs, np.mean(jacs)

def adjacent_cosine(chap_list, new_order=None):
    """Mean cosine similarity between neighbouring texts.

    Texts are vectorised as binary counts of Han-character unigrams and
    bigrams; similarity is taken between each text and its successor in
    ``new_order``.

    Args:
        chap_list: sequence of strings.
        new_order: optional sequence of 0-based indices into ``chap_list``
            giving the reading order.  Defaults to the original order over
            *all* texts.

    Returns:
        The mean of the adjacent-pair similarities, or ``nan`` when the order
        contains fewer than two items (there is no pair to average).
    """
    vec = CountVectorizer(
        tokenizer=lambda s: re.findall(r'[\u4e00-\u9fff]', s),
        ngram_range=(1, 2),
        min_df=1,
        binary=True)
    doc_term = vec.fit_transform(chap_list)
    sim = cosine_similarity(doc_term)

    if new_order is None:
        # Cover every text: the previous fixed range(80) dropped the final
        # adjacent pair of an 81-text corpus and broke on shorter inputs.
        new_order = np.arange(sim.shape[0])
    else:
        new_order = np.asarray(new_order)

    if len(new_order) < 2:
        return np.nan
    return np.mean([sim[i, j] for i, j in zip(new_order, new_order[1:])])

# Per-variant evaluation helper.
def evaluate_one(lbl, txt_raw, order):
    """Return ``(smoothness, adjacent cosine)`` for one reordered variant.

    Args:
        lbl: variant label; accepted for call-site symmetry, not used in the
            computation.
        txt_raw: texts in original order, indexable by the entries of ``order``.
        order: sequence of indices giving the reordered reading sequence.

    Returns:
        ``(smooth, adj_cos)`` where ``smooth`` is the mean value returned by
        ``han_jaccard`` on the reordered texts and ``adj_cos`` is the mean
        cosine similarity between consecutive reordered texts, using the same
        binary character 1-2-gram vectorisation as the reordering step.
    """
    # 1. Materialise the texts in their new order.
    reord_txt = [txt_raw[i] for i in order]

    # 2. Character-set smoothness between neighbours.
    _, smooth = han_jaccard(reord_txt)

    # 3. Adjacent cosine on the already-reordered texts, so neighbours are
    #    simply rows k and k + 1.
    vec = CountVectorizer(tokenizer=lambda s: re.findall(r'[\u4e00-\u9fff]', s),
                          ngram_range=(1, 2), min_df=1, binary=True)
    doc_term = vec.fit_transform(reord_txt)
    sim = cosine_similarity(doc_term)
    # Pair count follows the data instead of a fixed 80 (= 81 - 1).
    n_pairs = len(reord_txt) - 1
    adj_cos = np.mean([sim[k, k + 1] for k in range(n_pairs)])
    return smooth, adj_cos

# Evaluation loop: works from the in-memory texts (inputs_raw) and reorder
# indices (orders) built earlier, without re-reading the V*_final.txt files.
# Note: `results` is rebound here and no longer holds the reorder results.
results = []
for lbl in ('A', 'B', 'C', 'D', 'E', 'F'):
    smooth, adj_cos = evaluate_one(lbl, inputs_raw[lbl], orders[lbl])
    results.append(dict(Group=lbl, Smooth=smooth, AdjCos=adj_cos))

eval_df = pd.DataFrame(results)
eval_df.to_csv(out_dir / 'evaluation_6groups_aligned.csv', index=False)
print(eval_df)

  Group    Smooth    AdjCos
0     A  0.130081  0.112247
1     B  0.173294  0.155404
2     C  0.090813  0.092548
3     D  0.229990  0.232792
4     E  0.123440  0.105024
5     F  0.161516  0.144489


---
Open in Colab: [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/YOUR_USER/ddj-reorder-jca/blob/main/bilingual_notebook.ipynb)