In [33]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from pymongo import MongoClient
from tqdm import tqdm
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
import urllib.request
import datetime
import copy
import re

In [34]:
# Set up the MongoDB client and the target collection.
# NOTE(review): host, port and credentials appear to have been blanked out;
# `port=` has no value, so this cell will not run until they are filled in.
client = MongoClient(host="", port=, username='', password='')
db = client['BOF2024_crawling']
collection = db['naver_tv_crawling']
# Load only the _id and URL fields of the documents whose "검색키워드" field
# equals "Busan One Asia Festival".
data_list = list(collection.find({"검색키워드" : "Busan One Asia Festival"}, {"_id": 1, "URL": 1}))

In [35]:
# Iterate over an independent deep copy so the original query result stays untouched.
data_list1 = copy.deepcopy(data_list)

In [36]:
# Start a Chrome WebDriver session; process_link() uses this module-level `driver`.
driver = webdriver.Chrome()

In [37]:
# Convert a like-count label to an integer (e.g. "1.3만" -> 13000).
def convert_like_count(like_count_text):
    """Parse a like-count label into an integer.

    Thousands separators (",") are removed. If the label contains the Korean
    unit "만" (ten thousand), the unit is stripped and the remaining number is
    multiplied by 10000. Fractional results are truncated toward zero.

    Args:
        like_count_text: Label text such as "1,234" or "1.3만".

    Returns:
        The parsed count as an int, or 0 when the text is not a finite number.
    """
    # Local import keeps this block self-contained. Decimal arithmetic is exact,
    # so values such as "0.57만" yield 5700 rather than the 5699 produced by
    # truncating a binary float product.
    from decimal import Decimal, InvalidOperation

    cleaned = like_count_text.replace(',', '')  # drop thousands separators
    try:
        if '만' in cleaned:
            count = Decimal(cleaned.replace('만', '')) * 10000
        else:
            count = Decimal(cleaned)
        return int(count)
    except (ValueError, OverflowError, InvalidOperation):
        # InvalidOperation: not a number; ValueError/OverflowError: NaN/Infinity.
        return 0

In [38]:
# Extract the year from an airing-date string.
def extract_airing_year(airing_date_text):
    """Return the four leading digits of *airing_date_text* as a string.

    An empty string is returned when the text does not start with four digits.
    """
    leading_year = re.match(r'^(\d{4})', airing_date_text)
    return leading_year.group(1) if leading_year else ""

In [39]:
def _wait_for_text(locator, timeout=10):
    """Return the stripped text of the element at *locator*, or None if it does not show up in time."""
    try:
        elem = WebDriverWait(driver, timeout).until(EC.element_to_be_clickable(locator))
    except (TimeoutException, NoSuchElementException):
        # WebDriverWait.until reports a missing element as TimeoutException;
        # NoSuchElementException is kept for parity with the previous handlers.
        return None
    return elem.text.strip()


def process_link(url):
    """Scrape like count, upload date and airing year from *url* and store them in MongoDB.

    The page is opened with the module-level ``driver``, the optional
    "more" button is clicked, the page is scrolled to the bottom, and the
    three values are written to the document in ``collection`` whose "URL"
    field equals *url*. Elements that cannot be found fall back to 0 (likes)
    or "" (dates), so the document is still updated.

    Args:
        url: Address of the page to process.

    Returns:
        None. Any other exception is printed and swallowed so that one failing
        URL does not stop the caller.
    """
    try:
        driver.get(url)

        time.sleep(3)  # fixed pause after navigation

        # Expand the description via the "more" button when it exists.
        try:
            more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.ArticleSection_button_open__mTDBA'))
            )
            more_button.click()
            time.sleep(3)
        except TimeoutException:
            print("더보기 버튼 없음. 계속 진행합니다.")

        # Scroll down until the document height stops growing.
        SCROLL_PAUSE_TIME = 1.5
        last_height = driver.execute_script("return document.documentElement.scrollHeight")

        while True:
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)
            new_height = driver.execute_script("return document.documentElement.scrollHeight;")
            if new_height == last_height:
                break
            last_height = new_height

        # Like count: a missing element or the bare label "좋아요" both count as 0.
        like_count_text = _wait_for_text(
            (By.CSS_SELECTOR, '.ArticleSection_button_item__fvSDs .ArticleSection_text__Hdvc7')
        )
        if like_count_text is None or like_count_text == "좋아요":
            like_count = 0
        else:
            like_count = convert_like_count(like_count_text)

        # Upload date: the element text plus a trailing ".", or "" when missing.
        upload_text = _wait_for_text(
            (By.CSS_SELECTOR, '.ArticleSection_list_item__q6zOX > span:last-child')
        )
        date_text = "" if upload_text is None else upload_text + '.'

        # Airing year: taken from the span following the "방영일" label, or "" when missing.
        airing_date_text = _wait_for_text(
            (By.XPATH, "//span[text()='방영일']/following-sibling::span")
        )
        airing_year = "" if airing_date_text is None else extract_airing_year(airing_date_text)

        # Store the scraped values in MongoDB.
        collection.update_one({"URL": url}, {"$set": {
            "좋아요": like_count,
            "업로드 날짜": date_text,
            "공연일자": airing_year
        }})

    except Exception as e:
        # Best-effort boundary: report the failure and let the caller continue.
        print("URL 처리 중 오류 발생:", e)

In [40]:
# Process every stored URL; always release the MongoDB client and the browser,
# even if the loop is interrupted or raises.
try:
    for data in tqdm(data_list1, desc="Processing Links", unit="link"):
        url = data['URL']
        process_link(url)
finally:
    client.close()
    driver.quit()

Processing Links:   8%|▊         | 1/13 [00:18<03:38, 18.23s/link]

더보기 버튼 없음. 계속 진행합니다.


Processing Links:  23%|██▎       | 3/13 [01:01<03:23, 20.34s/link]

더보기 버튼 없음. 계속 진행합니다.


Processing Links:  46%|████▌     | 6/13 [02:02<02:18, 19.78s/link]

더보기 버튼 없음. 계속 진행합니다.


Processing Links:  54%|█████▍    | 7/13 [02:27<02:09, 21.52s/link]

더보기 버튼 없음. 계속 진행합니다.


Processing Links: 100%|██████████| 13/13 [04:30<00:00, 20.78s/link]
