In [15]:
import time
from datetime import datetime, timedelta
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import emoji

In [None]:
#抓取人氣討論頁面的文章ID，並儲存到CSV列表

#Selenium設定
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # 無頭模式
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
driver = webdriver.Chrome(options=options)

# 設定目標 URL
url = "https://www.cmoney.tw/forum/popular/buzz?tab=popular"
driver.get(url)
time.sleep(5)  # 等待網頁加載

# 滑動頁面以加載所有內容
for _ in range(5):  # 根據頁面大小調整滑動次數
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)

# 抓取文章ID
article_links = driver.find_elements(By.CSS_SELECTOR, 'a[href^="/forum/article/"][data-v-57c2c234]')
article_ids = list(set([link.get_attribute("href").split("/")[-1] for link in article_links]))  # 確保ID唯一

print(f"找到文章數量：{len(article_ids)} 篇")
print(f"文章ID列表：{article_ids}")

# 關閉驅動
driver.quit()

# 將ID存成CSV檔
output_file = "article_ids.csv"

try:
    # 如果文件已存在，讀取已存在的ID
    existing_data = pd.read_csv(output_file)
    existing_ids = set(existing_data["article_id"].tolist())
    print(f"已有 {len(existing_ids)} 篇文章ID")
except FileNotFoundError:
    # 如果文件不存在，初始化空的ID集合
    existing_ids = set()

# 添加新ID，並確保不重複
new_ids = set(article_ids) - existing_ids
print(f"新增 {len(new_ids)} 篇文章ID")

# 將新ID添加到文件
if new_ids:
    new_data = pd.DataFrame({"article_id": list(new_ids)})
    new_data.to_csv(output_file, mode="a", header=not existing_ids, index=False, encoding="utf-8-sig")

print(f"文章ID已儲存到 {output_file}")



找到文章數量：60 篇
文章ID列表：['167698929', '167717864', '167710582', '167698998', '167732423', '167722567', '167717191', '167701412', '167708500', '167700197', '167708511', '167720214', '167720553', '167724392', '167707412', '167724413', '167723549', '167708675', '167709490', '167706368', '167697843', '167706956', '167709442', '167728336', '167703514', '167729875', '167721158', '167707218', '167699437', '167697065', '167693551', '167695080', '167728895', '167699925', '167715475', '167697936', '167696104', '167701711', '167700797', '167730781', '167728052', '167693530', '167711400', '167712314', '167719000', '167698904', '167727666', '167719817', '167701783', '167703126', '167698994', '167721479', '167718948', '167698120', '167709685', '167730948', '167726025', '167722441', '167712261', '167693824']
新增 60 篇文章ID
文章ID已儲存到 article_ids.csv


In [None]:
#Selenium設定
options = webdriver.ChromeOptions()
options.add_argument("--headless")  
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
driver = webdriver.Chrome(options=options)

#讀取文章ID列表
input_file = "article_ids.csv"  
article_ids = pd.read_csv(input_file)["article_id"].tolist()

#儲存
data = []

#處理每篇文章
for article_index, article_id in enumerate(article_ids[:50]):  
    try:
        print(f"正在處理第 {article_index + 1} 篇文章 (ID: {article_id})...")
        article_url = f"https://www.cmoney.tw/forum/article/{article_id}"
        driver.get(article_url)
        time.sleep(5)  # 等待頁面加載

        # 檢查是否有付費解鎖標籤
        try:
            paywall_tag = driver.find_elements(By.CSS_SELECTOR, "div.articlePayPoint__text")
            if paywall_tag:
                print(f"文章 {article_id} 需要付費解鎖，跳過...")
                continue
        except Exception as e:
            print(f"檢查付費標籤失敗: {e}")

        # 抓取作者名稱
        try:
            author_tag = driver.find_elements(By.CSS_SELECTOR, "span[data-v-57c2c234], h1[data-v-57c2c234]")
            if author_tag:
                author = emoji.replace_emoji(author_tag[0].text.strip(), replace="")  #移除emoji
            else:
                author = "未知作者"
        except Exception as e:
            author = "未知作者"
            print(f"作者抓取失敗: {e}")

        # 抓取發布時間
        try:
            time_tag = driver.find_element(By.CSS_SELECTOR, f'a[href="/forum/article/{article_id}"]').text.strip()
            current_weekday = datetime.now().weekday()  # 取得當前星期幾 (0=星期一, 6=星期日)

            if "小時前" in time_tag:
                hours_ago = int(re.search(r"(\d+)小時前", time_tag).group(1))
                publish_time = (datetime.now() - timedelta(hours=hours_ago)).strftime("%Y-%m-%d %H:%M:%S")
            elif "昨天" in time_tag:
                time_part = re.search(r"昨天 (\d{2}:\d{2})", time_tag).group(1)
                publish_time = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d") + f" {time_part}:00"
            elif re.match(r"\d{4}年\d{2}月\d{2}日 \d{2}:\d{2}", time_tag):
                publish_time = datetime.strptime(time_tag, "%Y年%m月%d日 %H:%M").strftime("%Y-%m-%d %H:%M:%S")
            elif re.match(r"星期[一二三四五六日] \d{2}:\d{2}", time_tag):
                # 解析 "星期X HH:MM"
                match = re.search(r"星期([一二三四五六日]) (\d{2}:\d{2})", time_tag)
                week_map = {"一": 0, "二": 1, "三": 2, "四": 3, "五": 4, "六": 5, "日": 6}
                target_weekday = week_map[match.group(1)]  # 對應的星期幾
                time_part = match.group(2)  # 時間部分

                # 計算日期差
                delta_days = (current_weekday - target_weekday) % 7
                if delta_days == 0:  # 如果是今天的星期
                    delta_days = 7
                publish_time = (datetime.now() - timedelta(days=delta_days)).strftime("%Y-%m-%d") + f" {time_part}:00"
            else:
                publish_time = "未知時間"
        except Exception as e:
            publish_time = "未知時間"
            print(f"時間抓取失敗: {e}")

        # 抓取股票標籤
        try:
            stock_tags = driver.find_elements(By.CSS_SELECTOR, "h2.articleTags__text")
            stocks = [tag.text.strip() for tag in stock_tags]
        except Exception as e:
            stocks = []
            print(f"股票標籤抓取失敗: {e}")

        # 抓取文章內容
        try:
            # 檢查是否有 "繼續閱讀" 按鈕
            read_more_button = driver.find_elements(By.CSS_SELECTOR, "span.btn")
            if read_more_button:
                read_more_button[0].click()  # 點擊 "繼續閱讀"
                time.sleep(2)
                content_tag = driver.find_element(By.CSS_SELECTOR, "pre.textRule__text")
            else:
                content_tag = driver.find_element(By.CSS_SELECTOR, "div.articleContent__text>div.textRule>div.textRule__body>div.textRule__content>pre.textRule__text")
            content = emoji.replace_emoji(content_tag.text.strip(), replace="")  # 移除 emoji
        except Exception as e:
            content = "無內容"
            print(f"文章內容抓取失敗: {e}")

        # 抓取留言
        try:
            comments = []
            while True:
                comment_tags = driver.find_elements(By.CSS_SELECTOR, "pre.undefined")
                comments.extend([emoji.replace_emoji(comment.text.strip(), replace="") for comment in comment_tags])
                load_more_button = driver.find_elements(By.CSS_SELECTOR, "button.replyRespond__more")
                if not load_more_button:
                    break
                load_more_button[0].click()
                time.sleep(2)
        except Exception as e:
            comments = []
            print(f"留言抓取失敗: {e}")

        # 儲存數據（新增ID欄位）
        data.append({
            "ID": article_id,
            "author": author,
            "publish_time": publish_time,
            "stocks": ", ".join(stocks),  # 股票標籤以逗號分隔
            "content": content,
            "comments": comments  # 留言以列表形式存儲
        })

    except Exception as e:
        print(f"Error processing article {article_id}: {e}")

# 關閉驅動
driver.quit()

# 儲存到 CSV
output_file = "cmoney_articles.csv"
df = pd.DataFrame(data)
df.to_csv(output_file, index=False, encoding="utf-8-sig")
print(f"抓取完成，已儲存到 {output_file}")


正在處理第 1 篇文章 (ID: 167694057)...
正在處理第 2 篇文章 (ID: 167698998)...
正在處理第 3 篇文章 (ID: 167726149)...
正在處理第 4 篇文章 (ID: 167722567)...
正在處理第 5 篇文章 (ID: 167707089)...
正在處理第 6 篇文章 (ID: 167713377)...
正在處理第 7 篇文章 (ID: 167723941)...
正在處理第 8 篇文章 (ID: 167700197)...
正在處理第 9 篇文章 (ID: 167708511)...
正在處理第 10 篇文章 (ID: 167720214)...
正在處理第 11 篇文章 (ID: 167720553)...
正在處理第 12 篇文章 (ID: 167707412)...
正在處理第 13 篇文章 (ID: 167712783)...
正在處理第 14 篇文章 (ID: 167727020)...
正在處理第 15 篇文章 (ID: 167712097)...
正在處理第 16 篇文章 (ID: 167721518)...
正在處理第 17 篇文章 (ID: 167728748)...
文章 167728748 需要付費解鎖，跳過...
正在處理第 18 篇文章 (ID: 167699824)...
正在處理第 19 篇文章 (ID: 167706956)...
正在處理第 20 篇文章 (ID: 167700893)...
正在處理第 21 篇文章 (ID: 167728336)...
正在處理第 22 篇文章 (ID: 167728949)...
正在處理第 23 篇文章 (ID: 167721158)...
正在處理第 24 篇文章 (ID: 167707218)...
正在處理第 25 篇文章 (ID: 167698351)...
正在處理第 26 篇文章 (ID: 167722989)...
正在處理第 27 篇文章 (ID: 167728895)...
正在處理第 28 篇文章 (ID: 167698099)...
正在處理第 29 篇文章 (ID: 167711988)...
正在處理第 30 篇文章 (ID: 167731384)...
正在處理第 31 篇文章 (ID: 16770