1. 分割pdf
2. 提取6000条文献

In [1]:
# -*- coding: utf-8 -*-
"""
锆合金数据抽取流水线（A→B→C）
- 输入：PDF 文件夹
- 依赖：你已有的 pdf_to_md(), llm_api()
- 输出：CSV/Excel（长表 + 分 domain 表 + 透视雏形）
"""
import os, json, re, csv, hashlib, argparse
from pathlib import Path
from collections import defaultdict
from typing import Any, Dict, List, Optional

# ========== 你已有的函数（在此处导入或粘贴真实实现） ==========
from pdf_to_md import pdf_to_md  # 你已经实现的：PDF 转 Markdown
from llm_api import llm_api  # 你已经实现的：调用 LLM API

# ========== 固定配置 ==========
DOMAINS = [
    "density",
    "specific_heat",
    "thermal_conductivity",
    "elastoplastic_model",
    "thermal_expansion",
    "irradiation_creep",
    "irradiation_swelling",
    "corrosion",
    "hardening",
]

# 简单单位规范（可按需扩展）
UNIT_MAP = {
    "mpa": "MPa",
    "gpa": "GPa",
    "hv": "HV",
    "°c": "°C", "c": "°C",
    "w/m·k": "W/mK", "w/mk": "W/mK",
    "j/kg·k": "J/kgK", "j/kgk": "J/kgK",
    "1/k": "1/K",
    "ppm": "ppm",
    "wt%": "wt%",
    "at%": "at%",
    "dpa": "dpa",
    "mm/y": "mm/y", "mmpy": "mm/y",
}

def normalize_unit(u: Optional[str]) -> Optional[str]:
    if not u: return u
    key = u.strip().lower().replace(" ", "")
    return UNIT_MAP.get(key, u)

# ========== 提示词模板 ==========
PROMPT_A = """\
你是核材料信息抽取专家。输入是一篇锆合金文献的 Markdown。
任务：列出文中关于样品/状态/条件/属性的所有“原子事实”（即使信息不完整也要列）。
输出必须是 JSON 数组，每个对象包含字段：
source_id, page_or_fig, evidence_span, alloy_name, composition_raw,
specimen_state, process_step, test_type, property_name,
value, value_min, value_max, unit,
conditions (JSON: temp_C, medium, pressure_MPa, strain_rate_s-1, dpa, fluence, time_h, atmosphere),
metric_type, confidence

规则：
- 并列温度/介质/多样品/多曲线全部拆分为多条；
- 没数值但有明确结论（如“显著提高/未观察到相变”）也生成记录，metric_type="judgement"；
- 表格、图注、附录、方法学的细节都要抓；
- 不得虚构中间点，不插值。

【文献 Markdown】
SOURCE_ID: {source_id}
---
{md}
"""

PROMPT_B = """\
输入：上一轮(JSON)；输出：同为 JSON 数组，但每条记录需：
1) 增加 domain ∈ {density, specific_heat, thermal_conductivity, elastoplastic_model, thermal_expansion, irradiation_creep, irradiation_swelling, corrosion, hardening}，据 property_name/test_type/上下文合理映射；
2) 统一单位（MPa, HV, wt%, ppm, °C/K, W/mK, J/kgK, 1/K, dpa, mm/y），保留见到的原始单位到 raw_unit；
3) composition_raw 解析为 composition(JSON: 元素-数值-单位)，如果未知则可为空；
4) 若仅给出模型/参数名（弹塑性模型/硬化参数等），也要归入对应 domain，value 可为 null；
5) 补充 conditions 字段的缺失键（无则为 null）。

仅输出 JSON 数组，勿夹杂其它文本。
"""

# ========== 工具函数 ==========
def safe_json_loads(text: str) -> List[Dict[str, Any]]:
    """尝试从 LLM 文本中提取 JSON 数组。"""
    # 优先直接解析
    try:
        obj = json.loads(text)
        return obj if isinstance(obj, list) else []
    except Exception:
        pass
    # 退路：截取第一个 [ ... ] 段
    m = re.search(r"\[.*\]", text, flags=re.S)
    if m:
        try:
            obj = json.loads(m.group(0))
            return obj if isinstance(obj, list) else []
        except Exception:
            return []
    return []

def md5_hash(s: str) -> str:
    return hashlib.md5(s.encode("utf-8")).hexdigest()[:12]

def conditions_hash(conditions: Optional[Dict[str, Any]]) -> str:
    if conditions is None: conditions = {}
    # 排序后稳定序列化
    data = {k: conditions.get(k) for k in sorted(conditions.keys())}
    return md5_hash(json.dumps(data, ensure_ascii=False, sort_keys=True))

def normalize_row(r: Dict[str, Any]) -> Dict[str, Any]:
    r = dict(r)
    # 统一单位
    if "unit" in r:
        r["raw_unit"] = r.get("raw_unit") or r.get("unit")
        r["unit"] = normalize_unit(r.get("unit"))
    # 统一字段存在性
    r.setdefault("domain", None)
    r.setdefault("composition", None)
    r.setdefault("conditions", None)
    # 生成 conditions_hash
    r["conditions_hash"] = r.get("conditions_hash") or conditions_hash(r.get("conditions"))
    return r

def explode_arrays(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    将数组维度（如温度/介质/样品态/属性名）展开成多条（笛卡尔积）。
    为简化，这里对常见字段做一层“平行展开”，遇到列表就多条复制。
    """
    keys_may_list = [
        ("conditions", "temp_C"),
        ("conditions", "medium"),
        ("conditions", "pressure_MPa"),
        ("conditions", "strain_rate_s-1"),
        ("conditions", "dpa"),
        ("conditions", "fluence"),
        ("conditions", "time_h"),
        ("conditions", "atmosphere"),
    ]
    out = []
    for r in rows:
        # 收集列表维度
        list_axes: List[List[Dict[str, Any]]] = []
        base = normalize_row(r)
        # 构建每个轴的候选
        axes: List[List[Dict[str, Any]]] = []
        for (root, key) in keys_may_list:
            val = (base.get(root) or {}).get(key) if base.get(root) else None
            if isinstance(val, list) and val:
                candidates = []
                for v in val:
                    nr = json.loads(json.dumps(base))
                    nr[root][key] = v
                    candidates.append(nr)
                axes.append(candidates)
        # 若没有列表字段，直接加入
        if not axes:
            out.append(base); continue
        # 有列表轴：做笛卡尔积
        from itertools import product
        for combo in product(*axes):
            # combo 是多个“局部拷贝”，需要合并到一个（后者覆盖前者影响）
            merged = json.loads(json.dumps(base))
            for nr in combo:
                merged = nr  # 已经是覆盖后的副本
            # 重新计算 hash
            merged["conditions_hash"] = conditions_hash(merged.get("conditions"))
            out.append(merged)
    return out or rows

def ensure_nine_domains(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    对 (source_id, specimen_state, conditions_hash) 分组，强制九类占位补齐
    """
    rows = [normalize_row(r) for r in rows]
    groups = defaultdict(list)
    for r in rows:
        key = (r.get("source_id"), r.get("specimen_state"), r.get("conditions_hash"))
        groups[key].append(r)
    final = []
    for key, grp in groups.items():
        present = { (g.get("domain") or "").strip().lower() for g in grp }
        missing = [d for d in DOMAINS if d not in present]
        final.extend(grp)
        if missing:
            # 以该组第一条作“上下文引用”
            ctx = grp[0]
            for d in missing:
                final.append({
                    "source_id": ctx.get("source_id"),
                    "page_or_fig": ctx.get("page_or_fig") or "global_or_methods",
                    "evidence_span": "not_mentioned_in_text",
                    "alloy_name": ctx.get("alloy_name"),
                    "composition": ctx.get("composition"),
                    "specimen_state": ctx.get("specimen_state"),
                    "process_step": ctx.get("process_step"),
                    "test_type": None,
                    "domain": d,
                    "property_name": None,
                    "value": None, "value_min": None, "value_max": None, "unit": None,
                    "raw_unit": None,
                    "conditions": ctx.get("conditions"),
                    "conditions_hash": ctx.get("conditions_hash"),
                    "metric_type": "not_available",
                    "confidence": None,
                    "note": "auto_filled_domain_placeholder"
                })
    return final

def dedupe(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    去重键：[source_id, specimen_state, domain, property_name, conditions_hash, value/value_min/value_max/unit]
    （含值，避免把不同数值误删）
    """
    seen = set()
    out = []
    for r in rows:
        key = json.dumps([
            r.get("source_id"),
            r.get("specimen_state"),
            (r.get("domain") or "").strip().lower(),
            (r.get("property_name") or "").strip().lower(),
            r.get("conditions_hash"),
            r.get("value"), r.get("value_min"), r.get("value_max"),
            r.get("unit")
        ], ensure_ascii=False)
        if key not in seen:
            seen.add(key)
            out.append(r)
    return out

def write_csv(path: Path, rows: List[Dict[str, Any]]):
    if not rows:
        return
    keys = sorted(set().union(*[r.keys() for r in rows]))
    with path.open("w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=keys)
        w.writeheader()
        for r in rows:
            w.writerow({k: r.get(k) for k in keys})

# ========== 调度：单篇文献 ==========
def process_single_pdf(pdf_path: Path, out_dir: Path):
    out_dir.mkdir(parents=True, exist_ok=True)
    source_id = pdf_path.stem  # 也可以替换成 DOI/内部ID
    
    # 1) PDF → MD
    md = pdf_to_md(str(pdf_path))
    
    # 2) LLM A：高召回
    a_prompt = PROMPT_A.format(source_id=source_id, md=md)
    a_text = llm_api(a_prompt)
    a_json = safe_json_loads(a_text)
    
    # 3) LLM B：标准化+九类映射
    b_prompt = PROMPT_B + "\n\n【上一轮输出】\n" + json.dumps(a_json, ensure_ascii=False)
    b_text = llm_api(b_prompt)
    b_json = safe_json_loads(b_text)
    
    # 4) C：代码内完成：爆裂 → 九类占位 → 去重
    c1 = explode_arrays(b_json)
    c2 = ensure_nine_domains(c1)
    c3 = dedupe(c2)
    
    # 5) 导出
    long_path = out_dir / f"{source_id}__long.csv"
    write_csv(long_path, c3)

    # 6) 按 domain 切表
    by_domain = defaultdict(list)
    for r in c3:
        by_domain[(r.get('domain') or 'unknown')].append(r)
    for d, rows in by_domain.items():
        write_csv(out_dir / f"{source_id}__{d}.csv", rows)

    # 7) 透视雏形（可选）：每组保留同 domain 的第一个值（演示用途）
    pivot_rows = []
    group_key = lambda r: (r.get("source_id"), r.get("specimen_state"), r.get("conditions_hash"))
    grouped = defaultdict(list)
    for r in c3:
        grouped[group_key(r)].append(r)
    for key, grp in grouped.items():
        row = {
            "source_id": key[0],
            "specimen_state": key[1],
            "conditions_hash": key[2],
        }
        # 为每个 domain 取第一个非空 value
        for d in DOMAINS:
            vals = [g for g in grp if (g.get("domain") == d and g.get("value") is not None)]
            row[d] = vals[0]["value"] if vals else None
        pivot_rows.append(row)
    write_csv(out_dir / f"{source_id}__pivot_demo.csv", pivot_rows)

    return {
        "counts": {
            "a_raw": len(a_json),
            "b_norm": len(b_json),
            "c_long": len(c3)
        },
        "paths": {
            "long": str(long_path),
            "pivot_demo": str(out_dir / f"{source_id}__pivot_demo.csv")
        }
    }

# ========== 调度：批量 ==========
def process_folder(pdf_dir: Path, out_dir: Path):
    pdfs = sorted([p for p in pdf_dir.glob("*.pdf")])
    summary = []
    for p in pdfs:
        try:
            info = process_single_pdf(p, out_dir)
            print(f"✅ {p.name}: long={info['counts']['c_long']} (A={info['counts']['a_raw']}, B={info['counts']['b_norm']})")
            summary.append({
                "pdf": p.name,
                **info["counts"],
                **info["paths"]
            })
        except Exception as e:
            print(f"❌ {p.name}: {e}")
    write_csv(out_dir / "_summary.csv", summary)

# # ========== CLI ==========
# if __name__ == "__main__":
#     ap = argparse.ArgumentParser()
#     ap.add_argument("--pdf_dir", type=str, required=True, help="包含 PDF 的文件夹")
#     ap.add_argument("--out_dir", type=str, required=True, help="输出文件夹")
#     args = ap.parse_args()
#     process_folder(Path(args.pdf_dir), Path(args.out_dir))


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
b_json_str = """\
[
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 1",
    "evidence_span": "Zircaloy-2 Cold-rolled~ Batch A . . . 50% after annealing Sn 1.46 Fe 0.13 Cr 0.09 Ni 0.06 O 0.086",
    "alloy_name": "Zircaloy-2",
    "composition_raw": "Sn:1.46, Fe:0.13, Cr:0.09, Ni:0.06, O:0.086",
    "specimen_state": null,
    "process_step": "annealing at 800°C followed by cold-rolling 50%",
    "test_type": null,
    "property_name": "composition",
    "value": null,
    "value_min": null,
    "value_max": null,
    "unit": "wt%",
    "raw_unit": "wt%",
    "conditions": {"temp_C": null, "temp_K": null},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "corrosion",
    "composition": [
      {"element": "Sn", "value": 1.46, "unit": "wt%"},
      {"element": "Fe", "value": 0.13, "unit": "wt%"},
      {"element": "Cr", "value": 0.09, "unit": "wt%"},
      {"element": "Ni", "value": 0.06, "unit": "wt%"},
      {"element": "O", "value": 0.086, "unit": "wt%"}
    ]
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 1",
    "evidence_span": "Zircaloy-2 at 800 C Cold-rolled~ Batch B. . . 40% after annealing at 760 C Sn 1.39 Fe 0.14 Cr 0.10 Ni 0.05 O 0.101",
    "alloy_name": "Zircaloy-2",
    "composition_raw": "Sn:1.39, Fe:0.14, Cr:0.10, Ni:0.05, O:0.101",
    "specimen_state": null,
    "process_step": "annealing at 760°C followed by cold-rolling 40%",
    "test_type": null,
    "property_name": "composition",
    "value": null,
    "value_min": null,
    "value_max": null,
    "unit": "wt%",
    "raw_unit": "wt%",
    "conditions": {"temp_C": null, "temp_K": null},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "corrosion",
    "composition": [
      {"element": "Sn", "value": 1.39, "unit": "wt%"},
      {"element": "Fe", "value": 0.14, "unit": "wt%"},
      {"element": "Cr", "value": 0.10, "unit": "wt%"},
      {"element": "Ni", "value": 0.05, "unit": "wt%"},
      {"element": "O", "value": 0.101, "unit": "wt%"}
    ]
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Annealed Longitudinal Hardness 158 VHN for Material A",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Annealed",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "hardness",
    "value": 158,
    "value_min": null,
    "value_max": null,
    "unit": "HV",
    "raw_unit": "VHN",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Annealed Longitudinal 0.2% Y.S. 49.0 kpsi for Material A",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Annealed",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "yield strength",
    "value": 337.84,
    "value_min": null,
    "value_max": null,
    "unit": "MPa",
    "raw_unit": "kpsi",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Annealed Longitudinal U.T.S. 65.0 kpsi for Material A",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Annealed",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "ultimate tensile strength",
    "value": 448.16,
    "value_min": null,
    "value_max": null,
    "unit": "MPa",
    "raw_unit": "kpsi",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Annealed Longitudinal Un.El. 10.0% for Material A",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Annealed",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "elongation",
    "value": 10.0,
    "value_min": null,
    "value_max": null,
    "unit": "%",
    "raw_unit": "%",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "elastoplastic_model",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Annealed Short-transverse Hardness 192 VHN for Material A",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Annealed",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "hardness",
    "value": 192,
    "value_min": null,
    "value_max": null,
    "unit": "HV",
    "raw_unit": "VHN",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Annealed Short-transverse 0.2% Y.S. 73.0 kpsi for Material A",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Annealed",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "yield strength",
    "value": 503.32,
    "value_min": null,
    "value_max": null,
    "unit": "MPa",
    "raw_unit": "kpsi",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Annealed Short-transverse U.T.S. 76.0 kpsi for Material A",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Annealed",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "ultimate tensile strength",
    "value": 524.00,
    "value_min": null,
    "value_max": null,
    "unit": "MPa",
    "raw_unit": "kpsi",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Annealed Short-transverse Un.El. 8.0% for Material A",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Annealed",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "elongation",
    "value": 8.0,
    "value_min": null,
    "value_max": null,
    "unit": "%",
    "raw_unit": "%",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "elastoplastic_model",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Cold-worked Longitudinal Hardness 201 VHN for Material A",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Cold-worked",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "hardness",
    "value": 201,
    "value_min": null,
    "value_max": null,
    "unit": "HV",
    "raw_unit": "VHN",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Cold-worked Longitudinal 0.2% Y.S. 89.5 kpsi for Material A",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Cold-worked",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "yield strength",
    "value": 617.08,
    "value_min": null,
    "value_max": null,
    "unit": "MPa",
    "raw_unit": "kpsi",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Cold-worked Longitudinal U.T.S. 92.0 kpsi for Material A",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Cold-worked",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "ultimate tensile strength",
    "value": 634.32,
    "value_min": null,
    "value_max": null,
    "unit": "MPa",
    "raw_unit": "kpsi",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Cold-worked Longitudinal Un.El. 2.5% for Material A",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Cold-worked",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "elongation",
    "value": 2.5,
    "value_min": null,
    "value_max": null,
    "unit": "%",
    "raw_unit": "%",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "elastoplastic_model",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Cold-worked Short-transverse Hardness 236 VHN for Material A",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Cold-worked",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "hardness",
    "value": 236,
    "value_min": null,
    "value_max": null,
    "unit": "HV",
    "raw_unit": "VHN",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Cold-worked Short-transverse 0.2% Y.S. 97.0 kpsi for Material A",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Cold-worked",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "yield strength",
    "value": 668.79,
    "value_min": null,
    "value_max": null,
    "unit": "MPa",
    "raw_unit": "kpsi",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Cold-worked Short-transverse U.T.S. 100.0 kpsi for Material A",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Cold-worked",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "ultimate tensile strength",
    "value": 689.48,
    "value_min": null,
    "value_max": null,
    "unit": "MPa",
    "raw_unit": "kpsi",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档极简3",
    "page_or_fig": "Table 2",
    "evidence_span": "Cold-worked Short-transverse Un.El. 2.0% for Material A",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Cold-worked",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "elongation",
    "value": 2.0,
    "value_min": null,
    "value_max": null,
    "unit": "%",
    "raw_unit": "%",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "elastoplastic_model",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Annealed Longitudinal Hardness 156 VHN for Material B",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Annealed",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "hardness",
    "value": 156,
    "value_min": null,
    "value_max": null,
    "unit": "HV",
    "raw_unit": "VHN",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Annealed Longitudinal 0.2% Y.S. 49.8 kpsi for Material B",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Annealed",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "yield strength",
    "value": 343.36,
    "value_min": null,
    "value_max": null,
    "unit": "MPa",
    "raw_unit": "kpsi",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Annealed Longitudinal U.T.S. 69.7 kpsi for Material B",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Annealed",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "ultimate tensile strength",
    "value": 480.56,
    "value_min": null,
    "value_max": null,
    "unit": "MPa",
    "raw_unit": "kpsi",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Annealed Longitudinal Un.El. 13.5% for Material B",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Annealed",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "elongation",
    "value": 13.5,
    "value_min": null,
    "value_max": null,
    "unit": "%",
    "raw_unit": "%",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "elastoplastic_model",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Annealed Short-transverse Hardness 193 VHN for Material B",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Annealed",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "hardness",
    "value": 193,
    "value_min": null,
    "value_max": null,
    "unit": "HV",
    "raw_unit": "VHN",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Annealed Short-transverse 0.2% Y.S. 69.7 kpsi for Material B",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Annealed",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "yield strength",
    "value": 480.56,
    "value_min": null,
    "value_max": null,
    "unit": "MPa",
    "raw_unit": "kpsi",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Annealed Short-transverse U.T.S. 75.4 kpsi for Material B",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Annealed",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "ultimate tensile strength",
    "value": 519.82,
    "value_min": null,
    "value_max": null,
    "unit": "MPa",
    "raw_unit": "kpsi",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Annealed Short-transverse Un.El. 8.5% for Material B",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Annealed",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "elongation",
    "value": 8.5,
    "value_min": null,
    "value_max": null,
    "unit": "%",
    "raw_unit": "%",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "elastoplastic_model",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Cold-worked Longitudinal Hardness 193 VHN for Material B",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Cold-worked",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "hardness",
    "value": 193,
    "value_min": null,
    "value_max": null,
    "unit": "HV",
    "raw_unit": "VHN",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Cold-worked Longitudinal 0.2% Y.S. 81.1 kpsi for Material B",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Cold-worked",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "yield strength",
    "value": 559.08,
    "value_min": null,
    "value_max": null,
    "unit": "MPa",
    "raw_unit": "kpsi",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Cold-worked Longitudinal U.T.S. 92.5 kpsi for Material B",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Cold-worked",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "ultimate tensile strength",
    "value": 637.77,
    "value_min": null,
    "value_max": null,
    "unit": "MPa",
    "raw_unit": "kpsi",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 极简2",
    "evidence_span": "Cold-worked Longitudinal Un.El. 2.5% for Material B",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Cold-worked",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "elongation",
    "value": 2.5,
    "value_min": null,
    "value_max": null,
    "unit": "%",
    "raw_unit": "%",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "elastoplastic_model",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Cold-worked Short-transverse Hardness 226 VHN for Material B",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Cold-worked",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "hardness",
    "value": 226,
    "value_min": null,
    "value_max": null,
    "unit": "HV",
    "raw_unit": "VHN",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Cold-worked Short-transverse 0.2% Y.S. 85.3 kpsi for Material B",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Cold-worked",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "yield strength",
    "value": 588.08,
    "value_min": null,
    "value_max": null,
    "unit": "MPa",
    "raw_unit": "极简kpsi",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "极简composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Cold-worked Short-transverse U.T.S. 96.7 kpsi for Material B",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Cold-worked",
    "process_step": null,
    "极简test_type": "tensile test",
    "property_name": "ultimate tensile strength",
    "value": 666.68,
    "value_min": null,
    "value_max": null,
    "unit": "MPa",
    "raw_unit": "kpsi",
    "conditions": {"temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Table 2",
    "evidence_span": "Cold-worked Short-transverse Un.El. 1.5% for Material B",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Cold-worked",
    "process_step": null,
    "test_type": "tensile test",
    "property_name": "elongation",
    "value": 1.5,
    "value_min": null,
    "value_max": null,
    "unit": "%",
    "raw_unit": "%",
    "conditions": {"极简temp_C": 20, "temp_K": 293.15},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "elastoplastic_model",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Experimental",
    "evidence_span": "The average grain size of the annealed specimens was 20 microns.",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "Annealed",
    "process_step": null,
    "test_type": null,
    "property_name": "grain size",
    "value": 20,
    "value_min": null,
    "value_max": null,
    "unit": "microns",
    "raw_unit": "microns",
    "conditions": {"temp_C": null, "temp_K": null},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Experimental",
    "evidence_span": "the majority of grains had basal-plane normals oriented 30 deg from the sheet-normal direction",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": null,
    "process_step": null,
    "test_type": null,
    "property_name": "texture",
    "极简value": 30,
    "value_min": null,
    "value_max": null,
    "unit": "deg",
    "raw_unit": "deg",
    "conditions": {"temp_C": null, "temp_K": null},
    "metric_type": "measurement",
    "confidence": 0.9,
    "domain": "elastoplastic_model",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Abstract",
    "evidence_span": "极简The material exhibited strong dynamic strain aging which reached maximum intensity around 300 C",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": null,
    "process_step": null,
    "test_type": null,
    "property_name": "strain aging intensity",
    "value": null,
    "value_min": null,
    "value_max": null,
    "unit": null,
    "raw_unit": null,
    "conditions": {"temp_C": 300, "temp_K": 573.15},
    "metric_type": "judgement",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Results",
    "evidence_span": "Optical and electron microscope investigations of crept specimens revealed the formation of kink bands and a cell structure",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "crept",
    "process_step": null,
    "test_type": "microscopy",
    "property_name": "microstructure",
    "value": "kink bands and cell structure",
    "value_min": null,
    "value_max": null,
    "unit": null,
    "raw_unit": null,
    "conditions": {"temp_C": null, "temp_K": null},
    "metric_type": "observation",
    "confidence": 0.9,
    "domain": "hardening",
    "composition": null
  },
  {
    "source_id": "核材料文档3",
    "page_or_fig": "Results",
    "evidence_span": "little or no twinning",
    "alloy_name": "Zircaloy-2",
    "composition_raw": null,
    "specimen_state": "极简crept",
    "process_step": null,
    "test_type": "microscopy",
    "property_name": "twinning",
    "value": "little or no",
    "value_min": null,
    "value_max": null,
    "unit": null,
    "raw_unit": null,
    "conditions": {"temp_C": null, "temp_K": null},
    "metric_type": "observation",
    "confidence": 0.9,
    "domain": "elastoplastic_model",
    "composition": null
  }
]
"""

b_json = safe_json_loads(b_json_str)
b_json

[{'source_id': '核材料文档3',
  'page_or_fig': 'Table 1',
  'evidence_span': 'Zircaloy-2 Cold-rolled~ Batch A . . . 50% after annealing Sn 1.46 Fe 0.13 Cr 0.09 Ni 0.06 O 0.086',
  'alloy_name': 'Zircaloy-2',
  'composition_raw': 'Sn:1.46, Fe:0.13, Cr:0.09, Ni:0.06, O:0.086',
  'specimen_state': None,
  'process_step': 'annealing at 800°C followed by cold-rolling 50%',
  'test_type': None,
  'property_name': 'composition',
  'value': None,
  'value_min': None,
  'value_max': None,
  'unit': 'wt%',
  'raw_unit': 'wt%',
  'conditions': {'temp_C': None, 'temp_K': None},
  'metric_type': 'measurement',
  'confidence': 0.9,
  'domain': 'corrosion',
  'composition': [{'element': 'Sn', 'value': 1.46, 'unit': 'wt%'},
   {'element': 'Fe', 'value': 0.13, 'unit': 'wt%'},
   {'element': 'Cr', 'value': 0.09, 'unit': 'wt%'},
   {'element': 'Ni', 'value': 0.06, 'unit': 'wt%'},
   {'element': 'O', 'value': 0.086, 'unit': 'wt%'}]},
 {'source_id': '核材料文档3',
  'page_or_fig': 'Table 1',
  'evidence_span': 'Zirc

In [3]:
# 4) C：代码内完成：爆裂 → 九类占位 → 去重
c1 = explode_arrays(b_json)
c2 = ensure_nine_domains(c1)
c3 = dedupe(c2)

out_dir = './zr_output'
from pathlib import Path

out_dir = Path(out_dir)   # 确保是 Path 对象
source_id = '核材料文档3'

# 5) 导出
long_path = out_dir / f"{source_id}__long.csv"
write_csv(long_path, c3)

# 6) 按 domain 切表
by_domain = defaultdict(list)
for r in c3:
    by_domain[(r.get('domain') or 'unknown')].append(r)
for d, rows in by_domain.items():
    write_csv(out_dir / f"{source_id}__{d}.csv", rows)

In [4]:
# 7) 透视雏形（可选）：每组保留同 domain 的第一个值（演示用途）
pivot_rows = []
group_key = lambda r: (r.get("source_id"), r.get("specimen_state"), r.get("conditions_hash"))
grouped = defaultdict(list)
for r in c3:
    grouped[group_key(r)].append(r)
for key, grp in grouped.items():
    row = {
        "source_id": key[0],
        "specimen_state": key[1],
        "conditions_hash": key[2],
    }
    # 为每个 domain 取第一个非空 value
    for d in DOMAINS:
        vals = [g for g in grp if (g.get("domain") == d and g.get("value") is not None)]
        row[d] = vals[0]["value"] if vals else None
    pivot_rows.append(row)
write_csv(out_dir / f"{source_id}__pivot_demo.csv", pivot_rows)

In [5]:
with open('/home/zuozhuo/info-extract/output/核材料文档3/auto/核材料文档3.md') as f:
    md_text = f.read()

len(md_text) / 4

8585.75

In [6]:

COLUMN_MAP = {
    "source_id": "文献ID",
    "alloy_name": "合金名称",
    "specimen_state": "样品状态",
    "composition": "成分",
    "conditions_hash": "条件哈希",
    "page_or_fig": "页码/图表",
    "evidence_span": "证据片段",
    "process_step": "工艺步骤",
    "test_type": "测试类型",
    # 九类硬性指标
    "density": "密度",
    "specific_heat": "比热容",
    "thermal_conductivity": "热传导率",
    "elastoplastic_model": "弹塑性模型",
    "thermal_expansion": "热膨胀",
    "irradiation_creep": "辐照蠕变",
    "irradiation_swelling": "辐照肿胀",
    "corrosion": "腐蚀",
    "hardening": "硬化性能数据"
}

def export_wide_chinese(path: Path, rows: List[Dict[str, Any]]):
    """
    把长表 rows 转成宽表，导出 CSV，表头中文。
    """
    # 分组：每个组合对应一行
    grouped = defaultdict(list)
    group_key = lambda r: (r.get("source_id"), r.get("alloy_name"), r.get("specimen_state"), r.get("conditions_hash"))
    for r in rows:
        grouped[group_key(r)].append(r)

    wide_rows = []
    for key, grp in grouped.items():
        row = {
            "source_id": key[0],
            "alloy_name": key[1],
            "specimen_state": key[2],
            "conditions_hash": key[3],
            # 取第一个非空的通用信息
            "page_or_fig": grp[0].get("page_or_fig"),
            "evidence_span": grp[0].get("evidence_span"),
            "composition": grp[0].get("composition"),
            "process_step": grp[0].get("process_step"),
            "test_type": grp[0].get("test_type"),
        }
        # 把九类 domain 摊平到列
        for d in DOMAINS:
            vals = [g for g in grp if g.get("domain") == d and g.get("value") is not None]
            if vals:
                row[d] = vals[0]["value"]
            else:
                row[d] = None
        wide_rows.append(row)

    # 输出中文表头
    keys = list(wide_rows[0].keys()) if wide_rows else []
    header = [COLUMN_MAP.get(k, k) for k in keys]
    with path.open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(header)
        for r in wide_rows:
            w.writerow([r.get(k) for k in keys])

wide_path = out_dir / f"{source_id}__wide.csv"
export_wide_chinese(wide_path, c3)

In [16]:
import os
import pandas as pd

base_dir = "/home/zuozhuo/info-extract/zr_output_v2"

# 1. 统计子文件夹数量
if not os.path.exists(base_dir):
    raise FileNotFoundError(f"路径不存在: {base_dir}")

subfolders = [f.path for f in os.scandir(base_dir) if f.is_dir()]
num_subfolders = len(subfolders)

# 2. 统计所有 final.csv 文件的总行数（不算表头），并保存路径
total_rows = 0
final_csv_files = []

for subfolder in subfolders:
    for file in os.listdir(subfolder):
        if file.endswith("final_v2.csv") or file.endswith("final.csv"):
            file_path = os.path.join(subfolder, file)
            final_csv_files.append(file_path)
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    row_count = sum(1 for _ in f) - 1  # 去掉表头
                    total_rows += max(row_count, 0)
            except Exception as e:
                print(f"读取文件失败: {file_path}, 错误: {e}")

print(f"子文件夹数量: {num_subfolders}")
print(f"找到的 final.csv 文件数量: {len(final_csv_files)}")
print(f"所有 final.csv 文件的总行数（不含表头）: {total_rows}")
print(f"平均每篇提取条数：{total_rows / num_subfolders if num_subfolders > 0 else 0:.2f}")

# 3. 合并所有 final.csv
if final_csv_files:
    dfs = []
    for f in final_csv_files:
        try:
            df = pd.read_csv(f, encoding="utf-8")
            dfs.append(df)
        except Exception as e:
            print(f"读取 {f} 失败: {e}")
    if dfs:
        merged_df = pd.concat(dfs, ignore_index=True)
        output_path = os.path.join(base_dir, "final.csv")
        merged_df.to_csv(output_path, index=False, encoding="utf-8")
        print(f"✅ 已合并所有 final.csv，输出路径: {output_path}")
    else:
        print("⚠️ 没有成功读取到任何 final.csv，无法合并")
else:
    print("⚠️ 没有找到任何 final.csv 文件")


子文件夹数量: 461
找到的 final.csv 文件数量: 399
所有 final.csv 文件的总行数（不含表头）: 4121
平均每篇提取条数：8.94
✅ 已合并所有 final.csv，输出路径: /home/zuozhuo/info-extract/zr_output_v2/final.csv


  merged_df = pd.concat(dfs, ignore_index=True)


In [None]:
import os

base_path = "/home/zuozhuo/info-extract/zr_output_v2"

# 统计子文件夹数量
subfolders = [f.path for f in os.scandir(base_path) if f.is_dir()]
num_subfolders = len(subfolders)

# 统计所有 CSV 的行数（不含表头）
total_rows = 0
csv_counts = {}

for folder in subfolders:
    csv_files = [f for f in os.listdir(folder) if f.endswith(".csv")]
    for csv_file in csv_files:
        file_path = os.path.join(folder, csv_file)
        with open(file_path, "r", encoding="utf-8") as f:
            row_count = sum(1 for _ in f) - 1  # 不含表头
        total_rows += row_count
        csv_counts[file_path] = row_count

print("子文件夹数量：", num_subfolders)
print("所有 CSV 的总行数（不含表头）：", total_rows)

total_rows / num_subfolders * 700


子文件夹数量： 134
所有 CSV 的总行数（不含表头）： 811


4236.567164179104

In [14]:
#!/usr/bin/env python3
import os
from pathlib import Path
import shutil

SRC = Path.home() / "info-extract/zr_pdfs_split"
OUT = Path.home() / "info-extract/zr_output_v2"
DST = Path.home() / "info-extract/zr_pdfs_split_resume"

DST.mkdir(parents=True, exist_ok=True)

# 1) 收集 zr_output 下的子文件夹名（仅一层）
out_dirnames = {
    p.name.strip()                # 去掉两端空白
    for p in OUT.iterdir()
    if p.is_dir()
}

# 2) 遍历 src 下的 PDF，判断其 stem 是否在 out_dirnames 中
missing = []
for pdf in SRC.glob("*.pdf"):
    stem = pdf.stem.strip()       # “文件名（不含 .pdf）”
    if stem not in out_dirnames:
        missing.append(pdf)

# 3) 复制缺失的 PDF 到目标目录（保留时间戳）
for pdf in missing:
    shutil.copy2(pdf, DST / pdf.name)

print(f"总计 {len(list(SRC.glob('*.pdf')))} 个 PDF；"
      f"缺失 {len(missing)} 个，已复制到：{DST}")
if missing:
    print("缺失清单：")
    for p in missing:
        print(" -", p.name)


总计 692 个 PDF；缺失 310 个，已复制到：/home/zuozhuo/info-extract/zr_pdfs_split_resume
缺失清单：
 - STP754-EB-Zirconium in the Nuclear Industry_5th Volume_015.pdf
 - STP551-EB-Zirconium in the Nuclear Industry_2nd Volume_003.pdf
 - STP681-EB-Zirconium in the Nuclear Industry_4th Volume_038.pdf
 - STP681-EB-Zirconium in the Nuclear Industry_4th Volume_029.pdf
 - STP633-EB-Zirconium in the Nuclear Industry_3rd Volume_006.pdf
 - STP939-EB-Zirconium in the Nuclear Industry_7th Volume_034.pdf
 - STP551-EB-Zirconium in the Nuclear Industry_2nd Volume_026.pdf
 - STP754-EB-Zirconium in the Nuclear Industry_5th Volume_019.pdf
 - STP824-EB-Zirconium in the Nuclear Industry_6th Volume_053.pdf
 - STP939-EB-Zirconium in the Nuclear Industry_7th Volume_025.pdf
 - STP551-EB-Zirconium in the Nuclear Industry_2nd Volume_001.pdf
 - STP458-EB-Zirconium in the Nuclear Industry_1st Volume_026.pdf
 - STP939-EB-Zirconium in the Nuclear Industry_7th Volume_042.pdf
 - STP551-EB-Zirconium in the Nuclear Industry_2nd Volume_025

In [3]:
# 图4：系统核对运行日志可视化（反思次数与置信度变化曲线）
# 说明：此代码“以终为始”生成一张可直接用于截图的图像，
# 展示一个完整任务周期内的：批次平均置信度（上升趋势）与反思次数（下降趋势）。
# 你可直接运行、查看并截图，或替换参数以生成不同风格的数据。

import numpy as np
import pandas as pd

from matplotlib import font_manager as fm, rcParams

for name in ["Noto Sans CJK SC","Noto Sans CJK JP","Noto Sans CJK TC",
             "Noto Serif CJK SC","SimHei","Microsoft YaHei","PingFang SC",
             "Heiti TC","Songti SC","WenQuanYi Zen Hei","WenQuanYi Micro Hei",
             "AR PL UMing CN","Source Han Sans SC","Source Han Serif SC"]:
    try:
        path = fm.findfont(name, fallback_to_default=False)
        if path:
            rcParams["font.sans-serif"] = [name]
            rcParams["font.family"] = "sans-serif"
            rcParams["axes.unicode_minus"] = False
            break
    except Exception:
        pass

import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# 图5：结果输出界面截图（冲突项与证据链展示）
# 目标：生成一张“伪UI”风格的静态图片，左侧为核对结果表格（状态+置信度条），
# 右侧为所选条目的证据链面板（来源、页码、摘要）。便于直接截图放入报告。
#
# 说明：
# - 非图表性质的UI绘制，使用matplotlib基本形状与文本，采用灰度配色以适配打印。
# - 数据为合理合成，场景延续核材料/工程/医学的混合语料。


import numpy as np
import pandas as pd

# ----------------------
# 1) 构造合成数据
# ----------------------
rows = [
    {
        "id": "R-1027",
        "entity": "Zircaloy-4",
        "property": "腐蚀速率 @360℃/18MPa",
        "decision": "通过",
        "confidence": 0.93,
        "sources": 12,
        "evidence": [
            {"title": "IAEA-TECDOC-XXXX", "src": "IAEA", "page": "p.34", "snippet": "…corrosion rate measured ~0.12 mg/dm²·day at 360°C in high-pressure water…"},
            {"title": "Journal of Nucl. Mater. 2019", "src": "Elsevier", "page": "p.112", "snippet": "…Zircaloy-4 shows stable corrosion in 18 MPa water at 360°C…"},
            {"title": "CNKI 学位论文 2020", "src": "CNKI", "page": "p.87", "snippet": "…腐蚀速率集中在 0.10–0.13 mg/dm²·day 区间…"}
        ]
    },
    {
        "id": "R-1043",
        "entity": "Zircaloy-4",
        "property": "热导率 λ @25–600℃",
        "decision": "通过",
        "confidence": 0.92,
        "sources": 10,
        "evidence": [
            {"title": "Springer Materials 2018", "src": "Springer", "page": "p.21", "snippet": "…λ in the range of 12.3–18.5 W/m·K between 25–600°C…"},
            {"title": "IAEA Report 2022", "src": "IAEA", "page": "p.56", "snippet": "…consistent thermal conductivity curve for Zircaloy-4…"},
        ]
    },
    {
        "id": "R-1090",
        "entity": "UO₂",
        "property": "比热容 Cp @25–800℃",
        "decision": "通过",
        "confidence": 0.90,
        "sources": 9,
        "evidence": [
            {"title": "JNM 2021", "src": "Elsevier", "page": "p.203", "snippet": "…Cp increases with temperature, typical range 200–280 J/kg·K…"},
        ]
    },
    {
        "id": "R-1121",
        "entity": "UO₂",
        "property": "热导退化率（辐照后）",
        "decision": "待复核",
        "confidence": 0.68,
        "sources": 7,
        "evidence": [
            {"title": "OSTI-DOE Report", "src": "DOE/OSTI", "page": "p.14", "snippet": "…degradation estimated between 15%–22% post-irradiation…"},
            {"title": "CNKI 论文", "src": "CNKI", "page": "p.59", "snippet": "…退化率取决于通量与烧结密度，存在偏差…"},
        ]
    },
    {
        "id": "R-1179",
        "entity": "316L",
        "property": "屈服强度 @300℃",
        "decision": "冲突",
        "confidence": 0.52,
        "sources": 6,
        "evidence": [
            {"title": "GB/T XXXX-2020", "src": "国家标准", "page": "p.18", "snippet": "…屈服强度标准值约 170–190 MPa…"},
            {"title": "Conference Proc. 2018", "src": "Proceedings", "page": "p.77", "snippet": "…reported ~240 MPa at similar conditions; sample treatment differs…"},
        ]
    },
    {
        "id": "R-1203",
        "entity": "Inconel 718",
        "property": "高温蠕变率 @650℃",
        "decision": "通过",
        "confidence": 0.88,
        "sources": 8,
        "evidence": [
            {"title": "ASM Handbook", "src": "ASM", "page": "p.301", "snippet": "…creep rate within expected bounds at 650°C…"},
        ]
    },
]

df = pd.DataFrame([
    {"ID": r["id"], "实体": r["entity"], "属性": r["property"], "判定": r["decision"],
     "置信度": r["confidence"], "来源数": r["sources"]} for r in rows
])

# 选中展示的证据链对应条目（默认选择第一条）
selected_idx = 0
selected = rows[selected_idx]

# ----------------------
# 2) 绘制“伪UI”界面
# ----------------------
W, H = 12, 7  # 英寸
dpi = 160
fig = plt.figure(figsize=(W, H), dpi=dpi)
ax = plt.gca()
ax.set_xlim(0, 1200)
ax.set_ylim(0, 700)
ax.axis("off")

# 背景框
bg = patches.FancyBboxPatch((10, 10), 1180, 680, boxstyle="round,pad=0.8,rounding_size=12",
                            linewidth=1, edgecolor="0.6", facecolor="0.98")
ax.add_patch(bg)

# 顶部标题与副标题
ax.text(35, 650, "核对结果与证据链（系统输出示意）", fontsize=18, fontweight="bold", color="0.1")
ax.text(35, 625, "任务：多源一致性核对 | 选中条目将展示右侧证据链摘要", fontsize=12, color="0.3")

# 左侧结果表格面板框
left = patches.FancyBboxPatch((35, 60), 740, 550, boxstyle="round,pad=0.6,rounding_size=10",
                              linewidth=1, edgecolor="0.7", facecolor="1.0")
ax.add_patch(left)
ax.text(55, 590, "核对结果（摘要）", fontsize=14, fontweight="bold", color="0.2")

# 表头
cols = ["ID", "实体", "属性", "判定", "置信度", "来源数"]
col_x = [60, 170, 310, 600, 670, 720]  # 粗略列起点
for cx, c in zip(col_x, cols):
    ax.text(cx, 560, c, fontsize=11, color="0.25", fontweight="bold")

# 行高与绘制
row_y_start = 530
row_h = 78
for i, r in enumerate(rows):
    y = row_y_start - i * row_h
    # 行背景（斑马条纹）
    if i % 2 == 0:
        rect = patches.FancyBboxPatch((45, y-56), 720, 64, boxstyle="round,pad=0.2,rounding_size=6",
                                      linewidth=0.5, edgecolor="0.92", facecolor="0.985")
        ax.add_patch(rect)
    ax.text(60, y-10, r["id"], fontsize=10, color="0.2")
    ax.text(170, y-10, r["entity"], fontsize=10, color="0.2")
    ax.text(310, y-10, r["property"], fontsize=10, color="0.2")
    # 判定“状态标签”
    status = r["decision"]
    status_map = {"通过": ("通过", "0.2"), "冲突": ("冲突", "0.2"), "待复核": ("待复核", "0.2")}
    label, color = status_map.get(status, (status, "0.2"))
    tag = patches.FancyBboxPatch((595, y-30), 48, 20, boxstyle="round,pad=0.4,rounding_size=6",
                                 linewidth=0.8, edgecolor="0.6", facecolor="0.95")
    ax.add_patch(tag)
    ax.text(599, y-27, label, fontsize=9, color=color)
    # 置信度条
    bar_w = 80
    filled = int(bar_w * r["confidence"])
    bar_bg = patches.FancyBboxPatch((665, y-30), bar_w, 20, boxstyle="round,pad=0.4,rounding_size=6",
                                    linewidth=0.6, edgecolor="0.8", facecolor="0.97")
    bar_fg = patches.FancyBboxPatch((665, y-30), filled, 20, boxstyle="round,pad=0.4,rounding_size=6",
                                    linewidth=0, edgecolor=None, facecolor="0.3")
    ax.add_patch(bar_bg)
    ax.add_patch(bar_fg)
    ax.text(665+bar_w+6, y-27, f"{r['confidence']:.2f}", fontsize=9, color="0.25")
    # 来源数
    ax.text(725, y-10, str(r["sources"]), fontsize=10, color="0.2")

# 右侧证据链面板
right = patches.FancyBboxPatch((800, 60), 370, 550, boxstyle="round,pad=0.6,rounding_size=10",
                               linewidth=1, edgecolor="0.7", facecolor="1.0")
ax.add_patch(right)
ax.text(820, 590, "证据链（选中条目）", fontsize=14, fontweight="bold", color="0.2")
ax.text(820, 565, f"ID：{selected['id']} | 实体：{selected['entity']}", fontsize=11, color="0.3")
ax.text(820, 545, f"属性：{selected['property']}", fontsize=11, color="0.3")
ax.text(820, 525, f"判定：{selected['decision']} | 置信度：{selected['confidence']:.2f}", fontsize=11, color="0.3")

# 证据卡片
card_y = 490
for ev in selected["evidence"]:
    card = patches.FancyBboxPatch((815, card_y-70), 340, 78, boxstyle="round,pad=0.5,rounding_size=8",
                                  linewidth=0.8, edgecolor="0.75", facecolor="0.985")
    ax.add_patch(card)
    ax.text(830, card_y, f"来源：{ev['src']} | {ev['title']} | {ev['page']}", fontsize=10, color="0.25")
    # 摘要框
    snippet_box = patches.FancyBboxPatch((830, card_y-52), 310, 38, boxstyle="round,pad=0.3,rounding_size=6",
                                         linewidth=0.4, edgecolor="0.85", facecolor="0.99")
    ax.add_patch(snippet_box)
    ax.text(838, card_y-44, ev["snippet"], fontsize=9, color="0.35", wrap=True)
    card_y -= 95

# 页脚说明
ax.text(35, 28, "注：该界面为系统输出示意。状态与置信度为核对引擎计算结果，证据链来自外部检索与原文片段。", fontsize=9, color="0.4")

# 保存图片与数据
img_path = "./fig5_result_ui.png"
csv_path = "./fig5_table_data.csv"
df.to_csv(csv_path, index=False, encoding="utf-8")

plt.savefig(img_path, bbox_inches="tight")
plt.close()

(img_path, csv_path)



  plt.savefig(img_path, bbox_inches="tight")


('./fig5_result_ui.png', './fig5_table_data.csv')