# In [None]:
import os
import csv
import logging
from lxml import etree
from tqdm import tqdm

# ====== Set stkcd here, at the top of the script ======
# The value is interpolated into the output CSV file name by GubaParser.
stkcd = '000025'

class GubaParser:
    """Turn saved Guba list pages (HTML) into a single de-duplicated CSV.

    Every parsed post becomes one CSV row: running id, title, reply count,
    read count, author, author URL, update time and post URL.  Posts are
    de-duplicated by URL.  When the CSV already exists, its URLs and highest
    id are reloaded on construction so that a run can be resumed.

    The list pages only carry ``MM-DD hh:mm`` timestamps; ``adjust_year``
    reconstructs the year while rows are fed in page order.
    """

    # Site root prepended to site-relative hrefs.
    BASE_URL = "https://guba.eastmoney.com"

    # Column names written as the first CSV line.
    CSV_HEADER = [
        "序号", "标题", "评论数", "阅读数",
        "作者", "作者主页", "更新时间", "URL",
    ]

    # How many of the most recent rows must all be in January before a
    # December row is treated as belonging to the previous year.
    YEAR_WINDOW = 5

    def __init__(self, stock_code=None, start_year=2025):
        """Prepare the output CSV, logging and the de-duplication state.

        Args:
            stock_code: Code used in the output file name.  Defaults to the
                module-level ``stkcd`` so existing ``GubaParser()`` calls
                behave as before.
            start_year: Year assigned to the first rows that are parsed.
        """
        if stock_code is None:
            stock_code = stkcd

        # Output CSV path.
        self.output_file = f"data/{stock_code}股吧帖子.csv"
        # URLs already written; used as the de-duplication key.
        self.processed_urls = set()
        # Last running id handed out.
        self.current_id = 0
        # Year currently applied to "MM-DD" timestamps.
        self.current_year = start_year
        # Not read anywhere in this class; kept only so that external code
        # inspecting the attribute keeps working.
        self.prev_month_day = None
        # Rolling window of the last (month, day) pairs seen by adjust_year.
        self.prev_month_days = []

        # basicConfig() does nothing once the root logger has handlers, so
        # only build the handlers (which opens parser.log) when they will
        # actually be installed.
        if not logging.getLogger().handlers:
            logging.basicConfig(
                level=logging.ERROR,
                format='%(asctime)s - %(levelname)s - %(message)s',
                handlers=[
                    logging.FileHandler('parser.log', encoding='utf-8'),
                    logging.StreamHandler()
                ]
            )

        # A missing or zero-byte CSV gets a fresh header; otherwise the
        # existing rows are loaded for de-duplication and id continuity.
        if os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0:
            self._load_existing()
        else:
            self._write_header()

    def _write_header(self):
        """Create the output CSV (and its directory) containing only the header."""
        out_dir = os.path.dirname(self.output_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(self.output_file, 'w', encoding='utf-8-sig', newline='') as f:
            csv.writer(f).writerow(self.CSV_HEADER)

    def _load_existing(self):
        """Reload URLs and the highest id from the existing CSV.

        Blank rows are ignored and rows whose first column is not an integer
        are logged and skipped, so one damaged line does not discard the
        de-duplication state of every row after it.  I/O and CSV errors are
        logged, not raised.
        """
        try:
            with open(self.output_file, 'r', encoding='utf-8-sig', newline='') as f:
                reader = csv.reader(f)
                next(reader, None)  # skip the header; tolerate an empty file
                for line_no, row in enumerate(reader, start=2):
                    if not row:
                        continue
                    try:
                        idx = int(row[0])
                    except ValueError:
                        logging.error(f"跳过无法解析的行 (第 {line_no} 行)：{row}")
                        continue
                    self.current_id = max(self.current_id, idx)
                    self.processed_urls.add(row[-1])
        except (OSError, UnicodeError, csv.Error) as e:
            logging.error(f"加载已有数据失败：{e}")

    def adjust_year(self, raw_update):
        """Prefix an ``MM-DD hh:mm`` timestamp with the tracked year.

        The tracked year is decremented when the current row is in December
        and the previous ``YEAR_WINDOW`` rows were all in January.  That rule
        only makes sense when rows arrive newest first.  Requiring several
        January rows keeps a single out-of-order row from triggering a
        rollback.

        Args:
            raw_update: Timestamp text; surrounding or repeated whitespace
                is tolerated.

        Returns:
            ``"YYYY-MM-DD hh:mm"``, or ``raw_update`` unchanged when it does
            not have the ``MM-DD hh:mm`` shape (in which case the history is
            left untouched).
        """
        try:
            date_part, time_part = raw_update.split()
            month, day = map(int, date_part.split('-'))
        except (ValueError, AttributeError):
            return raw_update

        history = self.prev_month_days
        window = self.YEAR_WINDOW
        crossed_year = (
            month == 12
            and len(history) >= window
            and all(m == 1 for m, _ in history[-window:])
        )
        if crossed_year:
            self.current_year -= 1

        # Record this row and keep only the most recent `window` entries.
        history.append((month, day))
        del history[:-window]

        return f"{self.current_year}-{month:02d}-{day:02d} {time_part}"

    @classmethod
    def _absolute_url(cls, href):
        """Return *href* as an absolute URL.

        Absolute ``http``/``https`` hrefs are returned unchanged,
        protocol-relative hrefs (``//host/path``) get an ``https:`` scheme,
        and anything else is appended to ``BASE_URL``.
        """
        if href.startswith('http'):
            return href
        if href.startswith('//'):
            return f"https:{href}"
        return f"{cls.BASE_URL}{href}"

    @staticmethod
    def _first(node, path):
        """Return the first XPath match under *node*, stripped.

        Raises:
            IndexError: If the expression matches nothing.
        """
        return node.xpath(path)[0].strip()

    def parse_html(self, file_path):
        """Parse one saved list page into new CSV rows.

        Args:
            file_path: Path of a UTF-8 encoded HTML file.

        Returns:
            A list of complete CSV rows (id first) for posts whose URL has
            not been seen before.  Failures to read or parse the file, or a
            single post, are logged and do not raise.
        """
        posts = []
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                html = f.read()
            tree = etree.HTML(html)
            rows = tree.xpath('//tbody[@class="listbody"]/tr')
        except Exception as e:
            # Deliberate per-file boundary: one unreadable page must not
            # stop the whole run.
            logging.error(f"打开或解析文件失败 ({file_path})：{e}")
            return posts

        for tr in rows:
            try:
                title = self._first(tr, './/div[@class="title"]//a/text()')
                url = self._absolute_url(
                    self._first(tr, './/div[@class="title"]//a/@href'))

                author = self._first(tr, './/div[@class="author"]//a/text()')
                author_url = self._absolute_url(
                    self._first(tr, './/div[@class="author"]//a/@href'))

                read_count = self._first(tr, './/div[@class="read"]/text()')
                reply_count = self._first(tr, './/div[@class="reply"]/text()')
                raw_update = self._first(tr, './/div[@class="update"]/text()')

                # Called for every row, duplicates included, so the year
                # tracking sees the full sequence of dates.
                full_update = self.adjust_year(raw_update)
            except Exception as e:
                # Deliberate per-row boundary: skip the broken post only.
                logging.error(f"解析单条帖子失败 ({file_path})：{e}")
                continue

            # De-duplicate: a post is new only if its URL was never seen.
            if url in self.processed_urls:
                continue
            self.processed_urls.add(url)
            self.current_id += 1
            posts.append([
                self.current_id,
                title,
                reply_count,
                read_count,
                author,
                author_url,
                full_update,
                url,
            ])
        return posts

    def save_posts(self, posts):
        """Append freshly parsed rows to the output CSV.

        Args:
            posts: Rows as returned by ``parse_html``; an empty list is a
                no-op.  Write errors are logged, not raised.
        """
        if not posts:
            return
        try:
            with open(self.output_file, 'a', encoding='utf-8-sig', newline='') as f:
                csv.writer(f).writerows(posts)
        except (OSError, csv.Error) as e:
            logging.error(f"保存数据失败：{e}")

    @staticmethod
    def _page_number(file_name):
        """Return the integer after the last ``_`` in the file stem, or None.

        For example ``000025_20.html`` gives ``20``; a name without a numeric
        suffix gives ``None``.
        """
        stem = os.path.splitext(file_name)[0]
        try:
            return int(stem.split('_')[-1])
        except ValueError:
            return None

    def run(self, html_dir="data/pages"):
        """Parse every ``.html`` file in *html_dir* in page order and save it.

        Files are ordered by the page number in their name, e.g.
        ``000025_20.html`` before ``000025_21.html``.  Files without a
        numeric page suffix are logged and skipped.

        Args:
            html_dir: Directory holding the saved list pages.
        """
        if not os.path.isdir(html_dir):
            print(f"目录不存在：{html_dir}")
            return

        numbered = []
        for fn in os.listdir(html_dir):
            if not fn.endswith('.html'):
                continue
            page = self._page_number(fn)
            if page is None:
                logging.error(f"跳过无法识别页码的文件：{fn}")
                continue
            numbered.append((page, fn))
        # Sorting the (page, name) pairs makes ties deterministic.
        files = [fn for _, fn in sorted(numbered)]

        print(f"共 {len(files)} 个文件，开始依次解析……")
        for fn in tqdm(files, desc="解析进度"):
            fp = os.path.join(html_dir, fn)
            self.save_posts(self.parse_html(fp))

        print("全部解析完成，结果保存在：", self.output_file)

if __name__ == "__main__":
    # Script entry point: build the parser and process the saved pages.
    # Pass a different directory to run() if the HTML files are not under
    # the default data/pages.
    parser = GubaParser()
    parser.run("data/pages")
