In [41]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from pymongo import MongoClient
from tqdm import tqdm
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
import time
import urllib.request
import datetime
import copy
import re

In [42]:
# Set up the MongoDB client and the target collection.
# NOTE(review): host/port/username/password are blank here (presumably
# redacted); `port=` with no value is not valid Python and must be filled in
# before this cell can run.
client = MongoClient(host="", port=, username='', password='')
db = client['BOF2024_crawling']
collection = db['naver_tv_crawling']
# Fetch only the `_id` and `URL` fields of the documents matching this one URL.
data_list = list(collection.find({"URL" : "https://tv.naver.com/v/2294808"}, {"_id": 1, "URL": 1}))

In [43]:
# Deep-copy the query results; the crawl loop iterates over this copy.
data_list1 = copy.deepcopy(data_list)

In [44]:
# Start the Chrome WebDriver session used (as a global) by process_link.
driver = webdriver.Chrome()

In [45]:
# Convert a like-count label to an integer (e.g. "1.3만" -> 13000)
def convert_like_count(like_text):
    """Parse a like-count label into an integer.

    Thousands separators (",") and surrounding whitespace are ignored. A
    trailing Korean magnitude unit scales the number: 천 (x1,000),
    만 (x10,000) or 억 (x100,000,000).

    Args:
        like_text: Label text such as "1,234", "1.3만" or "좋아요".

    Returns:
        The like count as an int, or 0 when the text is empty, None, or not
        a number.
    """
    if not like_text:
        return 0

    text = like_text.replace(',', '').strip()

    multiplier = 1
    for unit, factor in (('억', 100000000), ('만', 10000), ('천', 1000)):
        if unit in text:
            text = text.replace(unit, '')
            multiplier = factor
            break

    try:
        # round() rather than int(): a float product can land a hair below
        # the intended integer, and truncation would then undercount by one.
        return round(float(text) * multiplier)
    except (ValueError, OverflowError):
        # Non-numeric text (or nan/inf) counts as zero likes.
        return 0

In [46]:
# Convert a comment timestamp label to the "YYYY.MM.DD." format
def convert_published_time(published_time, now=None):
    """Normalize a comment timestamp label to a "YYYY.MM.DD." string.

    Two kinds of label are recognized:

    * an absolute date such as "2023.1.5." (separators ".", "/" or "-"),
      which is returned zero-padded as "2023.01.05.";
    * a relative age made of a number followed by a Korean unit
      (초, 분, 시간, 일, 주, 개월/달, 년), which is subtracted from ``now``.
      Months count as 30 days and years as 365 days.

    Anything else (e.g. "방금 전") resolves to the date of ``now``.

    Args:
        published_time: Raw label text; None is treated as an empty string.
        now: Reference datetime for relative labels. Defaults to
            ``datetime.datetime.now()``.

    Returns:
        The date formatted with ``"%Y.%m.%d."``.
    """
    current_date = now if now is not None else datetime.datetime.now()
    text = (published_time or "").strip()

    # An absolute date must be kept as-is instead of collapsing to "today".
    absolute = re.search(r"(\d{4})[./-]\s*(\d{1,2})[./-]\s*(\d{1,2})", text)
    if absolute:
        try:
            year, month, day = (int(part) for part in absolute.groups())
            return datetime.datetime(year, month, day).strftime("%Y.%m.%d.")
        except ValueError:
            # Not a real calendar date; fall through to the relative parser.
            pass

    one_unit = {
        "초": datetime.timedelta(seconds=1),
        "분": datetime.timedelta(minutes=1),
        "시간": datetime.timedelta(hours=1),
        "일": datetime.timedelta(days=1),
        "주": datetime.timedelta(weeks=1),
        "개월": datetime.timedelta(days=30),
        "달": datetime.timedelta(days=30),
        "년": datetime.timedelta(days=365),
    }
    # The unit is taken from right after the number, so "1주일 전" is read
    # as one week rather than tripping over the "일" inside "주일".
    relative = re.search(r"(\d+)\s*(개월|시간|달|년|주|일|분|초)", text)
    if relative:
        amount = int(relative.group(1))
        published = current_date - one_unit[relative.group(2)] * amount
    else:
        published = current_date

    return published.strftime("%Y.%m.%d.")

In [47]:
def _pause_playback():
    """Click the player's play/pause switch; return True if the click worked.

    Pausing keeps a short video from finishing and auto-advancing to the
    next one, which would break the scraping that follows.
    """
    try:
        play_button = driver.find_element(By.CSS_SELECTOR, "button.pzp-button.pzp-playback-switch.pzp-pc-playback-switch.pzp-pc__playback-switch")
        play_button.click()
    except (NoSuchElementException, ElementClickInterceptedException):
        return False
    return True


def _scroll_to_bottom(pause_seconds=1.5):
    """Scroll down repeatedly until the document height stops growing."""
    last_height = driver.execute_script("return document.documentElement.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        time.sleep(pause_seconds)  # give the page time to load more content
        new_height = driver.execute_script("return document.documentElement.scrollHeight;")
        if new_height == last_height:
            break
        last_height = new_height


def _collect_comments():
    """Read the comments on the current page into a list of dicts.

    Raises:
        ValueError: If fewer comment bodies, dates or like counts than user
            names were found, so the fields cannot be paired up reliably.
    """
    # User names
    usernames = driver.find_elements(By.CSS_SELECTOR, "span.CommentListItem_nickname__Jd3gD")
    usernames_text = [username.text for username in usernames]

    # Comment bodies (blank ones are dropped)
    comments = driver.find_elements(By.CSS_SELECTOR, "p.CommentListItem_article__8wfwu")
    stripped = (comment.text.strip() for comment in comments)
    comments_text = [text for text in stripped if text]

    # Posting dates, normalized to "YYYY.MM.DD."
    dates = driver.find_elements(By.CSS_SELECTOR, "span.CommentListItem_time__MMnF3")
    dates_text = [
        convert_published_time(date.text.replace('등록일', '').strip())
        for date in dates
    ]

    # Like counts; a bare "좋아요" label means zero likes.
    like_buttons = driver.find_elements(By.XPATH, "//div[@class='CommentListItem_reaction_area__SY7fm']//button[.//span[text()='좋아요']]//span[@class='CommentListItem_text__M77fF']")
    likes = [
        0 if like.text == "좋아요" else convert_like_count(like.text)
        for like in like_buttons
    ]
    # find_elements() returns an empty list instead of raising, so the
    # "no like elements -> all zeros" fallback has to be an explicit check.
    if not likes:
        likes = [0] * len(usernames_text)

    expected = len(usernames_text)
    if min(len(comments_text), len(dates_text), len(likes)) < expected:
        raise ValueError(
            "comment fields are misaligned: {} names, {} bodies, {} dates, {} likes".format(
                expected, len(comments_text), len(dates_text), len(likes)
            )
        )

    # zip() stops after the last user name; surplus trailing items in the
    # other lists are ignored.
    return [
        {
            "사용자 이름": name,
            "댓글 내용": body,
            "작성일자": written_on,
            "댓글 좋아요": like_count,
        }
        for name, body, written_on, like_count in zip(
            usernames_text, comments_text, dates_text, likes
        )
    ]


def process_link(url):
    """Scrape all comments from one video page and store them in MongoDB.

    Opens ``url`` with the global ``driver``, pauses the video, scrolls to
    the bottom of the page, collects the comments, and writes them
    to the document in ``collection`` whose "URL" field equals ``url`` (fields
    "댓글" and "댓글 개수").

    Any exception is caught and printed so that one failing URL does not
    stop the caller; nothing is written for that URL in that case.

    Args:
        url: Address of the video page to crawl.
    """
    try:
        driver.get(url)

        time.sleep(3)

        # Pause the video; wait a moment after a successful click.
        if _pause_playback():
            time.sleep(3)

        # Scroll all the way down, then let the page settle.
        _scroll_to_bottom()

        time.sleep(5)

        comments_data = _collect_comments()

        # Store in MongoDB
        collection.update_one({"URL": url}, {"$set": {
            "댓글": comments_data,
            "댓글 개수": len(comments_data)
        }})

        time.sleep(3)

    except Exception as e:
        print("URL 처리 중 오류 발생:", e)

In [48]:
# Crawl every stored link. The MongoDB client and the browser are released in
# `finally` so they are closed even if the loop is interrupted part-way.
try:
    for data in tqdm(data_list1, desc="Processing Links", unit="link"):
        url = data['URL']
        process_link(url)
finally:
    try:
        client.close()
    finally:
        # Quit the browser even if closing the client fails.
        driver.quit()

Processing Links: 100%|██████████| 1/1 [00:16<00:00, 16.05s/link]
