In [None]:
import time
from selenium import webdriver
import csv
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


class NaverCafeCrawler:
    """Crawl a Naver Cafe article list and store title, body and link in a CSV.

    ``run`` is the entry point: for every list page it starts a headless
    Chrome session, logs in, collects the articles of that page (including
    the comment text found on each article page) and appends the rows that
    are not in the CSV file yet.
    """

    def __init__(self, driver_path, url, id, pw, baseurl, clubid, userDisplay, boardType):
        """Store the crawl configuration; no browser is started here.

        Args:
            driver_path: Value handed to Chrome as ``--webdriver-path`` in ``run``.
            url: Login page URL opened by ``login``.
            id: Account id written into the login form.
            pw: Account password written into the login form.
            baseurl: Prefix of the list URL; ``ArticleList.nhn?...`` is appended.
            clubid: Value of the ``search.clubid`` query parameter.
            userDisplay: Value of the ``userDisplay`` query parameter.
            boardType: Value of the ``search.boardType`` query parameter.
        """
        # CSV header: title, content, link.
        self.total_list = ['제목', '내용', '링크']
        self.driver_path = driver_path
        self.url = url
        self.id = id
        self.pw = pw
        self.baseurl = baseurl
        # Prefix prepended to the relative article links found in the list.
        self.baseraw = "https://cafe.naver.com"
        self.clubid = clubid
        self.userDisplay = userDisplay
        self.boardType = boardType

    def initialize_file(self, file_path='crawl.csv'):
        """Create (or truncate) ``file_path`` and write the CSV header row."""
        with open(file_path, 'w', encoding='utf-8', newline='') as f:
            csv.writer(f).writerow(self.total_list[:3])

    def login(self, browser):
        """Open the login page, fill in id/password via JavaScript and submit.

        Args:
            browser: Selenium WebDriver instance to drive.
        """
        browser.get(self.url)
        browser.implicitly_wait(2)
        # Hand the credentials over as script arguments rather than splicing
        # them into the JavaScript source: a quote or backslash in the id or
        # password would otherwise break (or alter) the injected script.
        browser.execute_script(
            "document.getElementsByName('id')[0].value = arguments[0]", self.id)
        browser.execute_script(
            "document.getElementsByName('pw')[0].value = arguments[0]", self.pw)
        browser.find_element(By.XPATH, '//*[@id="log.login"]').click()
        time.sleep(1)

    def crawl_page(self, browser, page_num):
        """Collect every article of one list page.

        Args:
            browser: Logged-in Selenium WebDriver instance.
            page_num: Value of the ``search.page`` query parameter.

        Returns:
            DataFrame with the columns '제목', '내용', '링크', one row per
            article that has a link.

        Raises:
            IndexError: If the page holds fewer than two
                ``article-board m-tcol-c`` elements (the second one is the
                one that gets parsed).
        """
        browser.get(f"{self.baseurl}ArticleList.nhn?search.clubid={self.clubid}&userDisplay={self.userDisplay}"
                    f"&search.boardType={self.boardType}&search.page={page_num}")
        browser.switch_to.frame('cafe_main')
        soup = bs(browser.page_source, 'html.parser')
        boards = soup.find_all(class_='article-board m-tcol-c')
        if len(boards) < 2:
            raise IndexError(
                f"expected at least 2 'article-board m-tcol-c' elements on page "
                f"{page_num}, found {len(boards)}")

        rows = []
        for cell in boards[1].find_all(class_='td_article'):
            anchor = cell.find(class_='article')
            href = anchor.get('href') if anchor is not None else None
            if not href:
                # No usable article link in this cell; nothing to fetch.
                continue
            full_link = self.baseraw + href
            title = anchor.get_text().strip()
            content = self.get_content(browser, full_link)
            rows.append({'제목': title, '내용': content, '링크': full_link})
        # Build the frame once instead of concatenating inside the loop.
        return pd.DataFrame(rows, columns=['제목', '내용', '링크'])

    def get_content(self, browser, link):
        """Open one article and return its body text followed by its comments.

        Args:
            browser: Logged-in Selenium WebDriver instance.
            link: Absolute URL of the article.

        Returns:
            Text of the ``article_viewer`` div ("" when missing). When comments
            are found, "\\n\\nComments:\\n" plus one entry per comment is
            appended; each entry ends with a "Replies:" section.
        """
        browser.get(link)
        time.sleep(1)
        browser.switch_to.frame('cafe_main')
        soup = bs(browser.page_source, 'html.parser')

        content_div = soup.find("div", {"class": "article_viewer"})
        content = content_div.get_text().strip() if content_div else ""

        box_attrs = {"class": "comment_text_box"}
        comments = []
        for box in soup.find_all("div", box_attrs):
            if box.find_parent("div", box_attrs) is not None:
                # Nested inside another comment box: reported as a reply of
                # that box by extract_replies, so do not list it twice.
                continue
            span = box.find("span", {"class": "text_comment"})
            if span is None:
                continue
            comments.append(
                span.get_text().strip() + "\nReplies:\n" + self.extract_replies(box))

        if comments:
            comments = "\n\n".join(comments)
            print(comments)
            content += "\n\nComments:\n" + comments

        return content

    def extract_replies(self, comment):
        """Return the text of the comment boxes nested inside ``comment``.

        Only ``comment_text_box`` divs that are descendants of ``comment`` are
        considered, and each one is visited exactly once (deeper levels are
        handled by recursion), so the work is bounded by the size of the
        subtree.

        NOTE(review): assumes replies, when the page nests them at all, live
        inside the parent's ``comment_text_box``; on a flat comment list this
        returns "" and every box is listed as its own comment by
        ``get_content`` — confirm against the live markup.

        Args:
            comment: BeautifulSoup element whose descendants are searched.

        Returns:
            Reply entries joined by blank lines, "" when there are none.
        """
        box_attrs = {"class": "comment_text_box"}
        replies = []
        covered = set()  # id() of boxes already emitted by a recursive call
        for box in comment.find_all("div", box_attrs):
            if id(box) in covered:
                continue
            span = box.find("span", {"class": "text_comment"})
            if span is None:
                continue
            covered.update(id(inner) for inner in box.find_all("div", box_attrs))
            replies.append(
                span.get_text().strip() + "\nReplies:\n" + self.extract_replies(box))

        return "\n\n".join(replies)

    @staticmethod
    def _select_new_rows(original_df, new_df, key='링크'):
        """Return the rows of ``new_df`` whose ``key`` is not in ``original_df``.

        Duplicated keys inside ``new_df`` are reduced to their first row.
        """
        unique_new = new_df.drop_duplicates(subset=key, keep='first')
        return unique_new[~unique_new[key].isin(original_df[key])]

    def run(self, max_pages=2, file_path='crawl.csv'):
        """Crawl list pages 1..``max_pages`` and append new articles to the CSV.

        Args:
            max_pages: Number of list pages to process.
            file_path: CSV file to (re)create and fill.
        """
        self.initialize_file(file_path)
        page_num = 0
        while page_num < max_pages:
            page_num += 1
            print(f"Processing page {page_num}....")
            original_df = pd.read_csv(file_path, encoding='utf-8')
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            # NOTE(review): this is passed as a Chrome command-line argument;
            # confirm it actually selects the chromedriver binary.
            chrome_options.add_argument(f'--webdriver-path={self.driver_path}')
            browser = webdriver.Chrome(options=chrome_options)
            try:
                self.login(browser)
                new_df = self.crawl_page(browser, page_num)
            finally:
                # quit() ends the whole driver session even when crawling fails.
                browser.quit()
            # Append only articles that are not in the file yet; rows already
            # on disk must not be written a second time.
            fresh_df = self._select_new_rows(original_df, new_df)
            fresh_df.to_csv(file_path, mode='a', header=False, index=False, encoding='utf-8')
        print("done completely....")

if __name__ == "__main__":
    # Script entry point. The values marked "user-defined" are placeholders
    # that have to be replaced before the crawler can run.
    driver_path = "Chrome-Driver-Path 지정"  # user-defined: ChromeDriver path
    url = "https://nid.naver.com/nidlogin.login"
    id = "네이버ID"  # user-defined: Naver account id
    pw = "네이버패스워드"  # user-defined: Naver account password
    baseurl = "https://cafe.naver.com/"
    clubid = 카페ID지정  # user-defined: placeholder name, replace with the cafe id
    userDisplay = 50
    boardType = "L"

    crawler = NaverCafeCrawler(
        driver_path=driver_path,
        url=url,
        id=id,
        pw=pw,
        baseurl=baseurl,
        clubid=clubid,
        userDisplay=userDisplay,
        boardType=boardType,
    )
    crawler.run(max_pages=1)  # user-defined: number of list pages to crawl
    # Load the crawl result for inspection.
    df = pd.read_csv("crawl.csv")
