In [1]:
import csv
import datetime
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def scrape_articles(board, start_date=None, end_date=None):
    # 設定Chrome Driver的執行檔路徑
    options = Options()
    options.chrome_executable_path = "C:/Users/05731/OneDrive/桌面/crawler/chromedriver.exe"

    # 建立Driver物件實體，用程式操作瀏覽器運作
    driver = webdriver.Chrome(options=options)

    # 設定起始頁面URL
    base_url = f"https://www.ptt.cc/bbs/{board}/index.html"
    driver.get(base_url)

    # 等待同意條款視窗出現
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//button[@name='yes']")))

    # 點擊同意條款連結
    agree_button = driver.find_element(By.XPATH, "//button[@name='yes']")
    agree_button.click()

    # 等待文章元素加載
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "title")))

    # 取得當前日期
    current_date = datetime.date.today().strftime("%Y-%m-%d")

    # 設定CSV檔案名稱
    filename = f"{board}_articles_{current_date}.csv"

    # 建立CSV檔案並寫入標題行
    with open(filename, "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["標題", "發文時間", "作者", "內容", "留言"])  # 加入留言欄位

        # 爬取文章
        page_url = base_url
        while True:
            # 取得文章元素列表
            article_elements = driver.find_elements(By.CLASS_NAME, "title")

            for article_element in article_elements:
                # 取得文章標題
                title = article_element.text.strip()

                # 點擊文章連結進入文章頁面
                title_link = article_element.find_element(By.CSS_SELECTOR, "a")
                link = title_link.get_attribute("href")
                driver.execute_script("window.open(arguments[0]);", link)
                driver.switch_to.window(driver.window_handles[-1])

                # 等待文章頁面加載
                WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, "main-content")))

                print("標題:", title)
                print("連結:", link)

                # 取得文章發文時間、作者和內容
                post_time_element = driver.find_element(By.CSS_SELECTOR, "div.article-metaline:nth-child(4) span.article-meta-value")
                post_time = post_time_element.text.strip()
                author_element = driver.find_element(By.CSS_SELECTOR, "div.article-metaline:nth-child(1) span.article-meta-value")
                author = author_element.text.strip()

                content_element = driver.find_element(By.ID, "main-content")
                content = content_element.text.strip()

                # 精簡內容
                content = content.replace(author, "").replace(post_time, "").replace(title, "")
                lines = content.strip().splitlines()
                cleaned_content = " ".join(line for line in lines if line.strip())
                cleaned_content = re.sub(r'//.*', '', cleaned_content)

                # 取得留言內容
                comments_elements = driver.find_elements(By.CSS_SELECTOR, "div.push span.push-content")
                comments = [comment.text.strip() for comment in comments_elements]

                # 檢查是否有隱藏的留言
                hidden_comments_elements = driver.find_elements(By.CSS_SELECTOR, "div.push span.push-content.hidden")
                for hidden_element in hidden_comments_elements:
                    driver.execute_script("arguments[0].style.display = 'block';", hidden_element)
                    comments.append(hidden_element.text.strip())

                # 寫入CSV檔案
                writer.writerow([title, post_time, author, cleaned_content, "\n".join(comments)])  # 將留言內容串接成字串

                driver.close()
                driver.switch_to.window(driver.window_handles[0])

            # 翻頁
            previous_page = driver.find_element(By.LINK_TEXT, "‹ 上頁")
            page_url = previous_page.get_attribute("href")
            driver.get(page_url)

            current_date = page_url.split("/")[-1].replace("index", "").replace(".html", "")

            if start_date and end_date:
                if start_date > current_date or current_date > end_date:
                    break

            WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.LINK_TEXT, "‹ 上頁")))

    driver.quit()


# 指定看板名稱，起始日期和結束日期
scrape_articles("Gossiping", start_date="2023-07-04", end_date="2023-07-05")


標題: [新聞] 志願役招募不順 立委馬文君提議：當兵加
連結: https://www.ptt.cc/bbs/Gossiping/M.1688541358.A.B09.html
標題: [問卦] 館長：我的便當在林口沒對手
連結: https://www.ptt.cc/bbs/Gossiping/M.1688541451.A.C07.html
標題: [問卦] 咖啡杯多久洗一次
連結: https://www.ptt.cc/bbs/Gossiping/M.1688541484.A.826.html
標題: Re: [新聞] 沖繩知事反對「台灣有事即日本有事」 外
連結: https://www.ptt.cc/bbs/Gossiping/M.1688541502.A.2FF.html
標題: Re: [新聞] 白宮「不明粉末」竟是古柯鹼！　當局展開
連結: https://www.ptt.cc/bbs/Gossiping/M.1688541598.A.A1B.html
標題: [問卦] 統一發票中獎可以去超商換錢嗎??
連結: https://www.ptt.cc/bbs/Gossiping/M.1688541634.A.6DE.html
標題: [問卦] 環球影城PASS門票一張五千真的有人會買?
連結: https://www.ptt.cc/bbs/Gossiping/M.1688541637.A.DCA.html
標題: Re: [爆卦] 郭董確定參加716遊行
連結: https://www.ptt.cc/bbs/Gossiping/M.1688541682.A.24D.html
標題: [問卦] 比有錢台灣人還是贏中國人啊
連結: https://www.ptt.cc/bbs/Gossiping/M.1688541691.A.91A.html
標題: [問卦] 機車yter是不是被車行跟油商綁架了?
連結: https://www.ptt.cc/bbs/Gossiping/M.1688541728.A.1BD.html
標題: [公告] 八卦板板規(2023.03.01)
連結: https://www.ptt.cc/bbs/Gossiping/M.1677600392.A.D12.html
標題: [協尋] 6/28 國1北苗栗段 早上車禍 行車記錄器


In [5]:
import csv
import datetime
import re
import requests
from bs4 import BeautifulSoup

def scrape_hotboards():
    # 發送GET請求，獲取熱門看板頁面內容
    response = requests.get("https://www.ptt.cc/bbs/hotboards.html")
    soup = BeautifulSoup(response.text, "html.parser")

    # 取得看板列表
    board_list = []
    board_elements = soup.find_all("div", class_="b-ent")
    for board_element in board_elements:
        name_element = board_element.find("div", class_="board-name")
        name = name_element.text.strip()
        board_list.append(name)

    # 設定CSV檔案名稱
    current_date = datetime.date.today().strftime("%Y-%m-%d")
    filename = f"hotboards_{current_date}.csv"

    # 寫入CSV檔案
    with open(filename, "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["看板名稱"])  # 加入欄位名稱行
        writer.writerows([[board] for board in board_list])

def scrape_articles(board, start_date=None, end_date=None):
    # 取得當前日期
    current_date = datetime.date.today().strftime("%Y-%m-%d")

    # 設定CSV檔案名稱
    filename = f"{board}_articles_{current_date}.csv"

    # 建立CSV檔案並寫入標題行
    with open(filename, "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["標題", "發文時間", "作者", "內容"])  # 不包含留言欄位

        # 爬取文章
        page_url = f"https://www.ptt.cc/bbs/{board}/index.html"
        while True:
            # 發送GET請求，獲取看板頁面內容
            response = requests.get(page_url)
            soup = BeautifulSoup(response.text, "html.parser")

            # 取得文章列表
            article_elements = soup.find_all("div", class_="r-ent")
            for article_element in article_elements:
                # 取得文章標題
                title_element = article_element.find("div", class_="title")
                title = title_element.text.strip()

                # 取得文章發文時間、作者和內容
                post_time_element = article_element.find("div", class_="date")
                post_time = post_time_element.text.strip()

                author_element = article_element.find("div", class_="author")
                author = author_element.text.strip()

                # 點擊文章連結進入文章頁面
                link = article_element.find("a")["href"]
                article_url = f"https://www.ptt.cc{link}"
                response = requests.get(article_url)
                article_soup = BeautifulSoup(response.text, "html.parser")

                content_element = article_soup.find("div", id="main-content")
                content = content_element.text.strip()

                # 精簡內容
                content = re.sub(r"※ 發信站:.*", "", content)
                content = re.sub(r"※ 文章網址:.*", "", content)
                content = re.sub(r"※ 編輯:.*", "", content)

                # 寫入CSV檔案
                writer.writerow([title, post_time, author, content])

            # 翻頁
            previous_page = soup.find("a", string="‹ 上頁")
            if previous_page is None:
                break

            page_url = f"https://www.ptt.cc{previous_page['href']}"

            # 檢查日期範圍
            current_date = page_url.split("/")[-1].replace("index", "").replace(".html", "")
            if start_date and end_date:
                if start_date > current_date or current_date > end_date:
                    break

def scrape_comments(board, start_date=None, end_date=None):
    # 取得當前日期
    current_date = datetime.date.today().strftime("%Y-%m-%d")

    # 設定CSV檔案名稱
    filename = f"{board}_comments_{current_date}.csv"

    # 建立CSV檔案並寫入標題行
    with open(filename, "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["標題", "發文時間", "作者", "留言"])  # 只包含留言欄位

        # 爬取留言
        page_url = f"https://www.ptt.cc/bbs/{board}/index.html"
        while True:
            # 發送GET請求，獲取看板頁面內容
            response = requests.get(page_url)
            soup = BeautifulSoup(response.text, "html.parser")

            # 取得文章列表
            article_elements = soup.find_all("div", class_="r-ent")
            for article_element in article_elements:
                # 取得文章標題
                title_element = article_element.find("div", class_="title")
                title = title_element.text.strip()

                # 取得文章發文時間、作者
                post_time_element = article_element.find("div", class_="date")
                post_time = post_time_element.text.strip()

                author_element = article_element.find("div", class_="author")
                author = author_element.text.strip()

                # 點擊文章連結進入文章頁面
                link = article_element.find("a")["href"]
                article_url = f"https://www.ptt.cc{link}"
                response = requests.get(article_url)
                article_soup = BeautifulSoup(response.text, "html.parser")

                # 取得留言列表
                comment_elements = article_soup.find_all("div", class_="push")
                for comment_element in comment_elements:
                    comment = comment_element.find("span", class_="push-content").text.strip()
                    writer.writerow([title, post_time, author, comment])

            # 翻頁
            previous_page = soup.find("a", string="‹ 上頁")
            if previous_page is None:
                break

            page_url = f"https://www.ptt.cc{previous_page['href']}"

            # 檢查日期範圍
            current_date = page_url.split("/")[-1].replace("index", "").replace(".html", "")
            if start_date and end_date:
                if start_date > current_date or current_date > end_date:
                    break

# 執行爬蟲任務
scrape_hotboards()
scrape_articles("Gossiping")
scrape_comments("Gossiping")



In [6]:
print(1+1)

2


In [7]:
git init


SyntaxError: invalid syntax (3277417328.py, line 1)