In [1]:
import requests
from lxml import html
import json
import re
from urllib.parse import urljoin

# 设置请求头
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

# 目标网址
url = 'https://www.zhouyi.cc/zhouyi/yijing64/4103.html'

def clean_text(text):
    """
    清洗文本：去除多余空白、换行、广告字符
    """
    if not text:
        return ""
    # 合并多个空白字符为单个空格
    text = re.sub(r'\s+', ' ', text)
    # 去除首尾空白
    text = text.strip()
    # 移除常见的无关字符
    text = re.sub(r'[^\u4e00-\u9fff\u3000-\u303f\uf000-\uffefa-zA-Z0-9\s\.\!\?\，\。\！\？\、\；\：\（\）\【\】]', '', text)
    return text

def extract_gua_content(url):
    """
    爬取并提取周易卦象内容
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        response.encoding = 'utf-8'
        tree = html.fromstring(response.content)

        # 使用 XPath 提取 class="gua_wp" 的 div 内容
        content_divs = tree.xpath('//*[@class="gua_wp"]')
        
        if not content_divs:
            print("❌ 未找到 class='gua_wp' 的元素")
            return None

        # 通常只有一个gua_wp，取第一个
        main_div = content_divs[0]

        # 提取内部所有文本，但保留结构
        # 我们将按标题（如“周易第一卦详解”）来分割内容
        raw_text = main_div.text_content()
        
        # 更精细的提取：尝试保留部分结构
        # 我们可以按“标题 + 内容”模式来解析
        data = {
            "source_url": url,
            "sections": []
        }

        # 获取所有子元素
        children = main_div.xpath('.//*')
        current_section = {"title": "引言", "content": []}
        
        for elem in children:
            tag = elem.tag
            text = elem.text_content().strip() if elem.text_content() else ""
            text = clean_text(text)
            
            # 如果是 h2, h3 等标题标签，开始新章节
            if tag in ['h2', 'h3', 'h4', 'strong'] and len(text) > 0 and not text.startswith('![]'):
                # 保存上一章节
                if current_section["content"]:
                    data["sections"].append(current_section)
                # 开始新章节
                current_section = {"title": text, "content": []}
            elif tag == 'p' and len(text) > 0:
                # 段落内容
                current_section["content"].append(text)
            elif tag == 'br':
                # 换行，可以视为段落分隔
                if current_section["content"] and current_section["content"][-1] != "":
                    current_section["content"].append("")
        
        # 添加最后一个章节
        if current_section["content"]:
            data["sections"].append(current_section)

        # 进一步清洗：合并连续的短句，去除空内容
        cleaned_sections = []
        for section in data["sections"]:
            # 过滤掉空行和极短的无效行
            content_lines = [line for line in section["content"] if line and len(line) > 5]
            # 合并成一个段落（可选）
            # full_text = " ".join(content_lines)
            cleaned_sections.append({
                "title": section["title"],
                "content": content_lines  # 保留为列表，更利于训练
            })
        
        data["sections"] = cleaned_sections

        # 尝试提取卦名
        title_elem = tree.xpath('//title/text()')
        if title_elem:
            title = clean_text(title_elem[0])
            # 提取 "周易第X卦_乾卦(乾为天)_乾上乾下" 中的 "乾为天"
            match = re.search(r'\(([^)]+?)\)', title)
            if match:
                data["gua_name"] = match.group(1)
            else:
                data["gua_name"] = title.split('_')[1].split('(')[0] if '_' in title else "未知"

        return data

    except Exception as e:
        print(f"❌ 爬取或解析失败: {e}")
        return None

# ------------------ 主程序 ------------------
if __name__ == "__main__":
    print(f"正在爬取 {url} ...")
    data = extract_gua_content(url)

    if data:
        # 保存为 JSON 文件
        output_file = 'zhouyi_gua_training_data.json'
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"✅ 成功提取数据，已保存为 '{output_file}'")

        # 打印数据结构预览
        print(f"\n--- 数据预览 (卦名: {data.get('gua_name', '未知')}) ---")
        for section in data["sections"][:3]:  # 只显示前3个章节
            print(f"\n【{section['title']}】")
            for line in section["content"][:2]:  # 每个章节显示前2行
                print(f"  {line}")
            if len(section["content"]) > 2:
                print("  ...")
    else:
        print("❌ 未能获取数据。")

正在爬取 https://www.zhouyi.cc/zhouyi/yijing64/4103.html ...
✅ 成功提取数据，已保存为 'zhouyi_gua_training_data.json'

--- 数据预览 (卦名: 未知) ---
