In [2]:
import copy
import json
import os
from typing import Dict, Any

from openai import OpenAI

# Existing llm_api helper: thin wrapper around the DeepSeek chat endpoint.
def llm_api(prompt: str, system_prompt: str="You are a helpful assistant") -> str:
    """Send one system/user message pair to ``deepseek-chat`` and return the reply.

    Args:
        prompt: Content of the user message.
        system_prompt: Content of the system message.

    Returns:
        The text of the first choice's message. An empty string is returned
        when the API delivers no text content, so the result is always a
        ``str`` as the annotation promises.
    """
    # The API key is read from the DEEPSEEK_API_KEY environment variable on
    # every call; a fresh client is built each time.
    client = OpenAI(api_key=os.getenv("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com")
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        stream=False
    )
    content = response.choices[0].message.content
    # The SDK types `message.content` as optional; callers slice and parse the
    # result as a string, so normalise a missing value to "".
    return content if content is not None else ""


# -------- Fixed JSON data structure -------- #
SCHEMA = {
    # Basic information about the paper.
    "paper": dict.fromkeys(("title", "doi", "year")),
    # Material information; one placeholder entry shows the per-material shape.
    "materials": [
        {
            "name": None,
            # e.g. [{"element": "Zr", "fraction": 2.5, "unit": "wt%"}]
            "composition": [],
            # Experimental conditions.
            "conditions": None,
        },
    ],
    # The nine property categories; each one must carry "data" and "formula".
    "properties": {
        prop_name: {"data": None, "formula": None}
        for prop_name in (
            "density",
            "specific_heat",
            "thermal_conductivity",
            "elastoplastic_model",
            "thermal_expansion",
            "irradiation_creep",
            "irradiation_swelling",
            "corrosion",
            "hardening",
        )
    },
}

# Property names in schema order.
HARD_PROPS = [*SCHEMA["properties"]]


# -------- Approach 1: single-pass extraction -------- #
def extract_once(md_text: str) -> Dict[str, Any]:
    """Ask the LLM once to fill SCHEMA from a paper's Markdown text.

    Args:
        md_text: Markdown content of the nuclear-materials paper.

    Returns:
        The parsed JSON object from the model reply. If the reply cannot be
        parsed into a JSON object, a warning is printed and
        ``{"raw": <reply>}`` is returned instead.
    """
    prompt = f"""
请从下面的核材料文献 Markdown 中抽取以下信息，并输出严格遵守下列 JSON 结构的结果：

{json.dumps(SCHEMA, ensure_ascii=False, indent=2)}

要求:
- 九类性能每个属性必须有 "data" 和 "formula" 两个 key
- 若文献中没有对应数据，保持 null
- 直接输出 JSON，不要附加解释

文献内容:
{md_text}
"""
    resp = llm_api(prompt, system_prompt="You are a nuclear materials information extraction assistant.")

    # Chat models often wrap JSON in a Markdown code fence (``` or ```json)
    # even when told not to; strip it so a valid payload is not rejected.
    text = (resp or "").strip()
    if text.startswith("```"):
        _, _, text = text.partition("\n")  # drop the opening fence line
        text = text.rsplit("```", 1)[0]    # drop the closing fence, if any

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = None

    # Callers index the result by key, so anything but a JSON object
    # (list, string, number) counts as a failed extraction.
    if isinstance(parsed, dict):
        return parsed

    print("⚠️ LLM 输出不是严格 JSON，原文：", (resp or "")[:500])
    return {"raw": resp}


# -------- Approach 2: per-property review -------- #
def _parse_reflection_reply(resp, prop):
    """Return the reviewed entry for *prop* from a model reply, or None if unusable.

    Accepts a full schema object (``{"properties": {prop: {...}}}``), a
    top-level ``{prop: {...}}`` object, or the bare ``"prop": {...}`` fragment
    that the review prompt shows as its structure example. A surrounding
    Markdown code fence is tolerated.
    """
    text = (resp or "").strip()
    if text.startswith("```"):
        _, _, text = text.partition("\n")  # drop the opening fence line
        text = text.rsplit("```", 1)[0].strip()  # drop the closing fence, if any

    # Try the reply as-is first, then wrapped in braces to recover a bare
    # `"prop": {...}` fragment.
    for candidate in (text, "{" + text + "}"):
        try:
            checked = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(checked, dict):
            break
    else:
        return None

    for container in (checked.get("properties"), checked):
        if isinstance(container, dict) and isinstance(container.get(prop), dict):
            reviewed = container[prop]
            # Guarantee the two mandatory keys even if the model omitted one.
            reviewed.setdefault("data", None)
            reviewed.setdefault("formula", None)
            return reviewed
    return None


def extract_with_reflection(md_text: str) -> Dict[str, Any]:
    """Run a first extraction, then have the LLM re-check each property in turn.

    Args:
        md_text: Markdown content of the nuclear-materials paper.

    Returns:
        A copy of the first-pass result in which every property in HARD_PROPS
        whose review reply was usable has been replaced by the reviewed
        entry. Properties whose review failed keep their first-pass value and
        a warning is printed.
    """
    # Initial extraction.
    first_pass = extract_once(md_text)
    # Deep copy: a shallow copy would share the nested "properties" dict, so
    # each accepted correction would also rewrite `first_pass` and change the
    # "first version" shown to the model in later iterations.
    final_result = copy.deepcopy(first_pass)

    # When the first pass could not be parsed it has no "properties" mapping;
    # create one so per-property reviews can still be recorded.
    if not isinstance(final_result.get("properties"), dict):
        final_result["properties"] = {}

    # Serialise the first pass once; it no longer changes inside the loop.
    first_pass_json = json.dumps(first_pass, ensure_ascii=False)

    # Review the nine property categories one at a time.
    for prop in HARD_PROPS:
        prompt = f"""
我们已有第一版抽取结果，请重点检查性能项“{prop}”是否正确。

要求:
- 重新检查文献内容，若有遗漏或错误请修正
- 输出的 JSON 必须符合以下结构，只修改 {prop} 对应的值
- {prop} 内必须包含 "data" 和 "formula"，若没有则为 null

结构示例:
"{prop}": {{"data": null, "formula": null}}

文献内容:
{md_text}

第一版结果:
{first_pass_json}
"""
        resp = llm_api(prompt, system_prompt="You are a nuclear materials information extraction assistant.")
        reviewed = _parse_reflection_reply(resp, prop)
        if reviewed is None:
            # Report unusable replies instead of dropping them silently.
            print(f"⚠️ {prop} 检查失败，原文：", (resp or "")[:200])
            continue
        final_result["properties"][prop] = reviewed
    return final_result


# ---------------- Manual test run ---------------- #
if __name__ == "__main__":
    # Read the Markdown with an explicit encoding: without it, open() uses the
    # platform locale, which can fail or garble non-ASCII text.
    with open('/home/zuozhuo/info-extract/output/核材料文档3/auto/核材料文档3.md', encoding="utf-8") as f:
        md_text = f.read()

    # Approach 1: single-pass extraction.
    print("===== 方案一：一次性抽取 =====")
    once = extract_once(md_text)
    print(json.dumps(once, ensure_ascii=False, indent=2))

    # Approach 2: per-property review. This runs its own first pass, so its
    # starting point may differ from the output printed above.
    print("\n===== 方案二：逐项检查 =====")
    reflected = extract_with_reflection(md_text)
    print(json.dumps(reflected, ensure_ascii=False, indent=2))


===== 方案一：一次性抽取 =====
{
  "paper": {
    "title": "The Effect of Texture and Strain Aging on Creep of Zircaloy-2",
    "doi": null,
    "year": "1969"
  },
  "materials": [
    {
      "name": "Zircaloy-2",
      "composition": [
        "Sn: 1.46 wt%",
        "Fe: 0.13 wt%",
        "Cr: 0.09 wt%",
        "Ni: 0.06 wt%",
        "O: 0.086 wt%"
      ],
      "conditions": "Cold-rolled 50% after annealing at 800°C"
    },
    {
      "name": "Zircaloy-2",
      "composition": [
        "Sn: 1.39 wt%",
        "Fe: 0.14 wt%",
        "Cr: 0.10 wt%",
        "Ni: 0.05 wt%",
        "O: 0.101 wt%"
      ],
      "conditions": "Cold-rolled 40% after annealing at 760°C"
    }
  ],
  "properties": {
    "density": {
      "data": null,
      "formula": null
    },
    "specific_heat": {
      "data": null,
      "formula": null
    },
    "thermal_conductivity": {
      "data": null,
      "formula": null
    },
    "elastoplastic_model": {
      "data": null,
      "formula": null
    },


## 反思结果：

- ===== 方案一：一次性抽取 =====
```json
{
  "paper": {
    "title": "The Effect of Texture and Strain Aging on Creep of Zircaloy-2",
    "doi": null,
    "year": "1969"
  },
  "materials": [
    {
      "name": "Zircaloy-2",
      "composition": [
        "Sn: 1.46 wt%",
        "Fe: 0.13 wt%",
        "Cr: 0.09 wt%",
        "Ni: 0.06 wt%",
        "O: 0.086 wt%"
      ],
      "conditions": "Cold-rolled 50% after annealing at 800°C"
    },
    {
      "name": "Zircaloy-2",
      "composition": [
        "Sn: 1.39 wt%",
        "Fe: 0.14 wt%",
        "Cr: 0.10 wt%",
        "Ni: 0.05 wt%",
        "O: 0.101 wt%"
      ],
      "conditions": "Cold-rolled 40% after annealing at 760°C"
    }
  ],
  "properties": {
    "density": {
      "data": null,
      "formula": null
    },
    "specific_heat": {
      "data": null,
      "formula": null
    },
    "thermal_conductivity": {
      "data": null,
      "formula": null
    },
    "elastoplastic_model": {
      "data": null,
      "formula": null
    },
    "thermal_expansion": {
      "data": null,
      "formula": null
    },
    "irradiation_creep": {
      "data": null,
      "formula": null
    },
    "irradiation_swelling": {
      "data": null,
      "formula": null
    },
    "corrosion": {
      "data": null,
      "formula": null
    },
    "hardening": {
      "data": "Dynamic strain aging with maximum intensity around 300°C",
      "formula": null
    }
  }
}
```

- ===== 方案二：逐项检查 =====
```json
{
  "paper": {
    "title": "The Effect of Texture and Strain Aging on Creep of Zircaloy-2",
    "doi": null,
    "year": 1969
  },
  "materials": [
    {
      "name": "Zircaloy-2",
      "composition": [
        "Sn: 1.46-1.39 wt%",
        "Fe: 0.13-0.14 wt%",
        "Cr: 0.09-0.10 wt%",
        "Ni: 0.05-0.06 wt%",
        "O: 0.086-0.101 wt%"
      ],
      "conditions": "Cold-rolled (40-50% reduction) and annealed (760-800°C) sheet material with texture (basal-plane normals oriented 20-30 deg from sheet-normal direction)"
    }
  ],
  "properties": {
    "density": {
      "data": null,
      "formula": null
    },
    "specific_heat": {
      "data": null,
      "formula": null
    },
    "thermal_conductivity": {
      "data": null,
      "formula": null
    },
    "elastoplastic_model": {
      "data": null,
      "formula": null
    },
    "thermal_expansion": {
      "data": null,
      "formula": null
    },
    "irradiation_creep": {
      "data": null,
      "formula": null
    },
    "irradiation_swelling": {
      "data": null,
      "formula": null
    },
    "corrosion": {
      "data": null,
      "formula": null
    },
    "hardening": {
      "data": "Dynamic strain aging with maximum intensity around 300°C, causing creep rate reduction and complete cessation in some tests",
      "formula": "ϵ = A t^m + c (creep strain-time relationship)"
    }
  }
}
```