In [46]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from pytube import YouTube
from pymongo import MongoClient
from tqdm import tqdm
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
import time
import urllib.request
import datetime
import copy

In [47]:
# MongoDB 클라이언트 및 컬렉션 설정
client = MongoClient(host="", port=, username='', password='')
db = client['YouTube_crawling']
collection = db['OneAsia_YouTube_crawling_2024']
data_list = list(collection.find({"URL": "https://www.youtube.com/watch?v=mj2ICcslnbU&pp=ygUe67aA7IKw7JuQ7JWE7Iuc7JWE7Y6Y7Iqk7Yuw67KM"}, {"_id": 1, "URL": 1}))

In [49]:
data_list1 = copy.deepcopy(data_list)

In [50]:
# 날짜 데이터 변환 (유튜브 댓글은 정확한 작성일자를 알 수 없음, 임의로 지정 ex) 1일전 ~ 13일전, 2주전 ~ 3주전... 등 오늘 날짜에서 해당 날짜를 (-), 시간, 분, 초의 경우 오늘 날짜를 할당)
def convert_published_time(published_time):
    current_date = datetime.datetime.now()

    if "주" in published_time:
        time_unit = int(published_time.split("주")[0])
        delta = current_date - datetime.timedelta(weeks=time_unit)
    elif "개월" in published_time:
        time_unit = int(published_time.split("개월")[0])
        delta = current_date - datetime.timedelta(days=30 * time_unit)
    elif "년" in published_time:
        time_unit = int(published_time.split("년")[0])
        delta = current_date - datetime.timedelta(days=365 * time_unit)
    elif "시간" in published_time:
        time_unit = int(published_time.split("시간")[0])
        delta = current_date - datetime.timedelta(hours=time_unit)
    elif "분" in published_time:
        time_unit = int(published_time.split("분")[0])
        delta = current_date - datetime.timedelta(minutes=time_unit)
    elif "초" in published_time:
        return current_date.strftime("%Y-%m-%d")
    else:
        # 1일부터 13일 전까지는 해당 일 수를 현재 날짜에서 (-)
        if "일" in published_time:
            time_unit = int(published_time.split("일")[0])
            delta = current_date - datetime.timedelta(days=time_unit)
            return delta.strftime("%Y-%m-%d")

    return delta.strftime("%Y-%m-%d")

In [51]:
# 좋아요 개수를 변환 (ex) 1.3천 → 1300, 1.3만 → 13000 형식으로 변환)
def convert_like_count(like_count_text):
    if not like_count_text:
        return 0
    
    if '천' in like_count_text:
        multiplier = 1000
        like_count_text = like_count_text.replace('천', '').strip()
        like_count = int(float(like_count_text) * multiplier)
    elif '만' in like_count_text:
        multiplier = 10000
        like_count_text = like_count_text.replace('만', '').strip()
        like_count = int(float(like_count_text) * multiplier)
    else:
        like_count = int(float(like_count_text))
    return like_count

In [52]:
driver = webdriver.Chrome()

In [53]:
def process_link(url):
    driver.get(url)

    # 맨 아래까지 스크롤
    SCROLL_PAUSE_TIME = 1.5
    last_height = driver.execute_script("return document.documentElement.scrollHeight")

    while True:
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        time.sleep(SCROLL_PAUSE_TIME)
        new_height = driver.execute_script("return document.documentElement.scrollHeight;")
        if new_height == last_height:
            break
        last_height = new_height

    time.sleep(3)

    comments = []

    # 댓글 요소 찾기
    try:
        comment_threads = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'ytd-comment-thread-renderer')))
    except TimeoutException:
        # 요소가 나타나지 않을 경우 댓글 개수를 0으로 설정하고 빈 리스트를 할당
        comment_count = 0
        comments = []
    else:
        comment_count = 0
        for comment_thread in tqdm(comment_threads, desc="댓글 수집 중"):
            # 사용자 이름 가져오기
            user_name = ""
            try:
                user_name_element = comment_thread.find_element(By.CSS_SELECTOR, 'a#author-text > span')
                user_name = user_name_element.text.strip()
            except NoSuchElementException:
                pass

            # 댓글 내용 가져오기
            comment_text = ""
            try:
                comment_text_element = comment_thread.find_element(By.CSS_SELECTOR, 'yt-attributed-string#content-text > span')
                comment_text = comment_text_element.text.strip()
            except NoSuchElementException:
                pass

            # 댓글 내용과 사용자 이름이 모두 공백이 아닌 경우에만 추가 (text로 변경 불가능한 이모티콘만 댓글에 작성된 경우 가져오지 않음 → 실제 댓글 개수와 차이날 수 있음)
            if user_name.strip() and comment_text.strip():
                # 작성일자 요소 찾기
                try:
                    published_time_element = comment_thread.find_element(By.CSS_SELECTOR, 'span#published-time-text')
                    published_time = published_time_element.text.strip()
                    published_time_formatted = convert_published_time(published_time)
                except NoSuchElementException:
                    published_time_formatted = ""

                # 댓글 좋아요 요소 찾기
                try:
                    like_count_element = comment_thread.find_element(By.CSS_SELECTOR, 'span#vote-count-middle')
                    like_count_text = like_count_element.get_attribute("innerText").strip()
                    like_count = convert_like_count(like_count_text)
                except NoSuchElementException:
                    like_count = 0

                comments.append({"사용자 이름": user_name, "댓글 내용": comment_text, "작성일자": published_time_formatted, "댓글 좋아요": like_count})
                comment_count += 1

        print(f"댓글 개수: {comment_count}")

    collection.update_one({"URL": url}, {"$set": {
        "댓글 개수": comment_count,
        "댓글": comments
    }})

In [54]:
for data in tqdm(data_list1, desc="링크 처리 중", leave=False):
    url = data['URL']
    process_link(url)

client.close()
driver.quit()

댓글 수집 중: 100%|██████████| 30/30 [00:00<00:00, 34.18it/s]
링크 처리 중:  50%|█████     | 1/2 [00:11<00:11, 11.53s/it]

댓글 개수: 30


댓글 수집 중: 100%|██████████| 30/30 [00:00<00:00, 34.38it/s]
                                                           

댓글 개수: 30
