# PDF2KG 数据处理流程（优化版）

本 notebook 展示从 PDF 文件到知识图谱的完整处理流程：文本提取、文本分块、知识图谱构建、结果保存，以及可视化并写入 Neo4j。

## 优化特性
- ✅ 多线程并发处理（5倍速度提升）
- ✅ 磁盘缓存（重复运行秒级响应）
- ✅ DeepSeek JSON Output（更可靠的输出）

## 1. 环境准备

In [ ]:
import subprocess
import time

import http.client
import urllib.error
import urllib.request

# Verify that the Docker CLI exists and the daemon answers before touching
# any container.
try:
    subprocess.run(["docker", "ps"], capture_output=True, check=True)
except (subprocess.CalledProcessError, OSError):
    # CalledProcessError: `docker ps` exited non-zero (daemon not reachable).
    # OSError: the docker executable is missing or cannot be run.
    # Anything else (e.g. KeyboardInterrupt) propagates without the hint.
    print("✗ 请先启动 Docker")
    raise
else:
    print("✓ Docker 正在运行")

# List the names of all containers, running or stopped, matching the filter.
result = subprocess.run(
    ["docker", "ps", "-a", "--filter", "name=neo4j-kg", "--format", "{{.Names}}"],
    capture_output=True, text=True
)

# The name filter matches substrings, so compare whole names; otherwise a
# container such as "neo4j-kg-old" would be mistaken for "neo4j-kg" and the
# `docker start` below would fail.
if "neo4j-kg" in result.stdout.split():
    subprocess.run(["docker", "start", "neo4j-kg"], check=True)
    print("✓ Neo4j 容器已启动")
else:
    # First run: create the container, publishing the HTTP (7474) and
    # Bolt (7687) ports and setting the initial credentials.
    subprocess.run([
        "docker", "run", "-d", "--name", "neo4j-kg",
        "-p", "7474:7474", "-p", "7687:7687",
        "-e", "NEO4J_AUTH=neo4j/password123",
        "neo4j:latest"
    ], check=True)
    print("✓ Neo4j 容器已创建并启动")

print("\n等待 Neo4j 启动...")

# Poll the HTTP endpoint instead of sleeping for a fixed time: the cell
# finishes as soon as the server answers, and "ready" is only reported when
# it actually did.
deadline = time.monotonic() + 60
neo4j_ready = False
while not neo4j_ready and time.monotonic() < deadline:
    try:
        urllib.request.urlopen("http://localhost:7474", timeout=2).close()
        neo4j_ready = True
    except urllib.error.HTTPError:
        # Any HTTP status, even an error one, means the server is answering.
        neo4j_ready = True
    except (OSError, http.client.HTTPException):
        # Connection refused / reset / timed out: not listening yet.
        time.sleep(1)

if neo4j_ready:
    print("✓ Neo4j 已就绪: http://localhost:7474 (neo4j/password123)")
else:
    # Best effort: warn instead of raising so the notebook can continue.
    print("⚠ Neo4j 在 60 秒内未响应，请稍后手动检查: http://localhost:7474")

In [None]:
%load_ext autoreload
%autoreload 2

# Force a fresh import of the project code: evict every cached `src.*`
# submodule from the interpreter's module table. The top-level `src` package
# entry itself does not match the prefix and is left in place.
import sys

stale_modules = [name for name in tuple(sys.modules) if name.startswith('src.')]
for name in stale_modules:
    del sys.modules[name]

import json
from pathlib import Path
from dotenv import load_dotenv

load_dotenv()

from src.pdf_parser import extract_text_from_pdf, chunk_text
from src.kg_builder import KGBuilder
from src.visualizer import KGVisualizer

In [None]:
%load_ext autoreload
%autoreload 2

import json
from pathlib import Path
from dotenv import load_dotenv

load_dotenv()

from src.pdf_parser import extract_text_from_pdf, chunk_text
from src.kg_builder import KGBuilder
from src.visualizer import KGVisualizer

## 2. PDF 文本提取

In [None]:
# Path of the PDF to process; replace the placeholder with a real file.
pdf_path = "data/your_document.pdf"

# Extract the document text with the project's PDF parser.
text = extract_text_from_pdf(pdf_path)

# Report the total length, then show the first 300 characters as a preview.
char_count = len(text)
print(f"提取完成！文本长度: {char_count} 字符")

preview = text[:300]
print(f"\n前 300 字符预览:\n{preview}...")

## 3. 文本分块

In [None]:
# Split the extracted text into overlapping chunks for the KG builder.
# chunk_size / overlap are presumably measured in characters; confirm
# against src.pdf_parser.chunk_text.
chunks = chunk_text(text, chunk_size=2000, overlap=200)
print(f"文本已分成 {len(chunks)} 个块")

# The visualization code that used to follow here referenced `kg_data`
# before the build cell assigns it, so a top-to-bottom run raised NameError.
# Graph building, statistics and rendering are done in the visualization
# section at the end of the notebook, once `kg_data` exists.

## 4. 构建知识图谱

In [None]:
# Create the builder. Per the original note, max_workers=5 means five
# concurrent worker threads.
kg_builder = KGBuilder(max_workers=5)

# Run extraction over all chunks. The original note says this step is
# multi-threaded and cached automatically (behavior lives in KGBuilder).
print("开始构建知识图谱...")
kg_data = kg_builder.build_kg_from_chunks(chunks)

# Summarize how many entities and relations were produced.
print("\n✓ 构建完成！")
entity_count = len(kg_data['entities'])
print(f"实体数: {entity_count}")
relation_count = len(kg_data['relations'])
print(f"关系数: {relation_count}")

## 5. 查看提取结果

In [None]:
# Show at most the first ten entities as "id (type)"; a missing type is
# displayed as 'unknown'.
print("前 10 个实体:")
for ent in kg_data['entities'][:10]:
    ent_id = ent['id']
    ent_type = ent.get('type', 'unknown')
    print(f"  - {ent_id} ({ent_type})")

# Show at most the first ten relations as "source --[type]--> target"; a
# missing type is displayed as 'related'.
print()
print("前 10 个关系:")
for rel in kg_data['relations'][:10]:
    src_id = rel['source']
    rel_type = rel.get('type', 'related')
    dst_id = rel['target']
    print(f"  - {src_id} --[{rel_type}]--> {dst_id}")

## 6. 保存结果

In [None]:
# Make sure the output directory exists (no error if it already does).
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)

# Write the graph as indented UTF-8 JSON, keeping non-ASCII characters
# unescaped so the file stays human-readable.
graph_file = output_dir / "knowledge_graph.json"
with graph_file.open('w', encoding='utf-8') as handle:
    json.dump(kg_data, handle, ensure_ascii=False, indent=2)

print("✓ 已保存到: output/knowledge_graph.json")

## 7. 可视化

In [None]:
# Load the extracted graph data into a fresh visualizer.
visualizer = KGVisualizer()
visualizer.build_graph(kg_data)

# Print each statistic reported by the visualizer.
print("图谱统计:")
graph_stats = visualizer.get_stats()
for stat_name, stat_value in graph_stats.items():
    print(f"  {stat_name}: {stat_value}")

# Render the graph to a PNG (described as a NetworkX visualization in the
# original note), passing a node limit of 100.
render_options = {"output_path": "output/knowledge_graph.png", "max_nodes": 100}
visualizer.visualize(**render_options)

# Optional: also store the graph in Neo4j. The password matches the
# NEO4J_AUTH value used when the container is created in the first cell.
visualizer.store_to_neo4j(neo4j_password="password123")