In [15]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from pymongo import MongoClient
from tqdm import tqdm
import time
from datetime import datetime, timedelta
import re
import copy

In [16]:
# Set up the MongoDB client and the target collection.
# NOTE(review): host, port, username and password are blank in this copy
# (presumably redacted); `port=,` will not parse until real values are filled in.
client = MongoClient(host="", port=, username='', password='')
db = client['BOF2024_crawling']
collection = db['tiktok_crawling']
# Load only the _id and URL fields of documents whose "검색키워드" field
# equals "BOF 釜山", materialized into a list up front.
data_list = list(collection.find({"검색키워드": "BOF 釜山"}, {"_id": 1, "URL": 1}))

In [17]:
# Work on an independent deep copy so the list fetched from MongoDB stays untouched.
data_list1 = copy.deepcopy(data_list)

In [18]:
# Start a Chrome WebDriver session with default options; it is kept in a
# module-level variable so later cells can reuse the same browser.
driver = webdriver.Chrome()

In [19]:
# Reference "now", captured once when this cell runs. Relative timestamps are
# resolved against it unless the caller passes its own reference.
today = datetime.today()

# Output format shared by every successful parse, e.g. "2024.05.10."
_DATE_FORMAT = '%Y.%m.%d.'


def _relative_amount(date_text, suffix):
    """Return the integer N from "N<suffix>" inside date_text, or None if absent."""
    match = re.search(rf'(\d+){re.escape(suffix)}', date_text)
    return int(match.group(1)) if match else None


def parse_date(date_text, reference=None):
    """Convert a displayed comment timestamp into a 'YYYY.MM.DD.' string.

    Recognized inputs:
      * "N일 전" (N days ago) and "N주 전" (N weeks ago)
      * "N시간 전" (N hours ago) and "N분 전" (N minutes ago); the amount is
        subtracted so a timestamp that crosses midnight lands on the previous day
      * "mm-dd" (year taken from the reference) and "yyyy-mm-dd"
      * anything else is treated as the reference date itself

    Args:
        date_text: Raw timestamp text.
        reference: datetime that relative values are measured from. Defaults
            to the module-level ``today``.

    Returns:
        The formatted date, or "N/A" when the text cannot be interpreted
        (missing number for days/weeks, malformed or impossible calendar date,
        non-string input).
    """
    base = today if reference is None else reference
    try:
        if "일 전" in date_text:
            days = _relative_amount(date_text, "일 전")
            if days is None:
                return "N/A"
            date = base - timedelta(days=days)
        elif "주 전" in date_text:
            weeks = _relative_amount(date_text, "주 전")
            if weeks is None:
                return "N/A"
            date = base - timedelta(weeks=weeks)
        elif "시간 전" in date_text:
            # No digits (e.g. the amount spelled out) falls back to the reference.
            date = base - timedelta(hours=_relative_amount(date_text, "시간 전") or 0)
        elif "분 전" in date_text:
            date = base - timedelta(minutes=_relative_amount(date_text, "분 전") or 0)
        elif '-' in date_text:
            parts = date_text.split('-')
            if len(parts) == 2:  # mm-dd
                month, day = int(parts[0]), int(parts[1])
                date = datetime(base.year, month, day)
            elif len(parts) == 3:  # yyyy-mm-dd
                year, month, day = int(parts[0]), int(parts[1]), int(parts[2])
                date = datetime(year, month, day)
            else:
                return "N/A"
        else:
            date = base  # unrecognized text defaults to the reference date

        return date.strftime(_DATE_FORMAT)
    except (TypeError, ValueError, OverflowError):
        # TypeError: non-string input; ValueError: non-numeric or impossible
        # date parts; OverflowError: absurdly large relative amounts.
        return "N/A"

In [20]:
# Multipliers for abbreviated like counts such as "1.2K".
_LIKE_SUFFIXES = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}


def _parse_like_count(text):
    """Convert a displayed like count ("12", "1,234", "1.2K", "") to an int.

    Empty text is treated as 0. Raises ValueError for text that is neither a
    plain integer nor a number followed by K/M/B.
    """
    cleaned = text.strip().replace(",", "").upper()
    if not cleaned:
        return 0
    multiplier = _LIKE_SUFFIXES.get(cleaned[-1])
    if multiplier is None:
        return int(cleaned)
    return int(round(float(cleaned[:-1]) * multiplier))


def _scroll_to_bottom(pause_seconds=1.5):
    """Scroll the current page down until its document height stops growing."""
    height_script = "return document.documentElement.scrollHeight"
    last_height = driver.execute_script(height_script)
    while True:
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        time.sleep(pause_seconds)  # give lazily loaded content time to render
        new_height = driver.execute_script(height_script)
        if new_height == last_height:
            return
        last_height = new_height


def process_link(url):
    """Scrape the first-level comments of one page and store them in MongoDB.

    Loads ``url`` in the shared ``driver``, scrolls to the bottom, collects
    username / text / date / like count for each comment, and writes the list
    plus its length onto the document whose "URL" field equals ``url``.

    Args:
        url: Page address to crawl; also the key used for the MongoDB update.

    Returns:
        None. Any failure is printed and swallowed so a batch run continues
        with the next link.
    """
    try:
        driver.get(url)
        time.sleep(5)  # wait for the initial page load

        _scroll_to_bottom()
        time.sleep(5)  # let the last batch of comments finish rendering

        # Collect every comment field as four parallel lists.
        # NOTE(review): the lists are matched purely by position; if one comment
        # lacks a field, or the text selector matches several spans per comment,
        # later rows shift. Confirm against the live DOM.
        username_elements = driver.find_elements(By.CSS_SELECTOR, "span[data-e2e='comment-username-1']")
        comment_elements = driver.find_elements(By.CSS_SELECTOR, "p[data-e2e='comment-level-1'] span")
        date_elements = driver.find_elements(By.CSS_SELECTOR, "span[data-e2e='comment-time-1']")
        like_elements = driver.find_elements(By.CSS_SELECTOR, "span[data-e2e='comment-like-count']")

        comments = []

        for username_element, comment_element, date_element, like_element in zip(
            username_elements, comment_elements, date_elements, like_elements
        ):
            try:
                username = username_element.text.strip()
                comment_text = comment_element.text.strip()

                # Keep only rows that have both an author and a body.
                if not (username and comment_text):
                    continue

                comments.append({
                    "사용자 이름": username,
                    "댓글 내용": comment_text,
                    "작성일자": parse_date(date_element.text),
                    # Abbreviated or empty counts no longer discard the comment.
                    "댓글 좋아요": _parse_like_count(like_element.text),
                })

            except Exception as e:
                # One bad row (stale element, unparseable count) must not
                # abort the rest of the page.
                print(f"Error processing comment: {e}")

        # Persist the comments together with their count.
        collection.update_one({"URL": url}, {"$set": {
            "댓글 개수": len(comments),
            "댓글": comments
        }})

    except Exception as e:
        print(f"Error processing link {url}: {e}")


In [None]:
# Crawl every stored link, then release the browser and the DB connection.
# The cleanup sits in `finally` so an interrupt or unexpected error does not
# leave a Chrome process or an open MongoDB connection behind.
try:
    for data in tqdm(data_list1, desc="링크 처리 중", leave=False):
        url = data.get('URL')
        if not url:
            # A document without a URL field cannot be crawled; skip it
            # instead of aborting the whole run with a KeyError.
            continue
        process_link(url)
finally:
    try:
        client.close()
    finally:
        driver.quit()