# 캐치테이블 리뷰 수집기 (최종)

이 노트북은 **검증된 셀렉터와 로직**을 사용하여 전체 식당에 대해 순차적으로 작업을 수행합니다.

**주요 로직:**
1. `캐치테이블_가게정보.csv` 로드
2. 각 식당의 현재 리뷰 개수 확인 (`review_count_history.csv`와 비교)
3. 히스토리에 없는 신규 식당이거나 리뷰 개수가 변동된 경우, 리뷰 페이지(`/review?sortingFilter=D`)로 진입하여 크롤링
4. 스크롤 방식: **마지막 카드 포커스 -> PageDown x2**
5. 결과물: `reviews_collected_YYYYMMDD.csv` 및 히스토리 업데이트

In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import datetime
import os
import re

In [2]:
# ==========================================
# Configuration
# ==========================================
# CSV listing the restaurants to process; read by the main cell.
INPUT_FILE = "캐치테이블_가게정보.csv"
# CSV holding the review count recorded for each restaurant URL.
HISTORY_FILE = "review_count_history.csv"
# Output CSV for collected reviews; the name embeds the date on which this
# cell is evaluated (YYYYMMDD).
OUTPUT_FILE = "reviews_collected_{}.csv".format(
    datetime.datetime.now().strftime("%Y%m%d")
)
# Reviews dated before this moment end the scrape of a restaurant.
CUTOFF_DATE = datetime.datetime(year=2025, month=12, day=9)

def get_driver():
    """Create and return a Chrome WebDriver configured for this notebook.

    The browser is started with a 1600x1000 window plus the ``--no-sandbox``
    and ``--disable-dev-shm-usage`` flags. The caller is responsible for
    calling ``quit()`` on the returned driver.
    """
    chrome_options = Options()
    for flag in (
        "--window-size=1600,1000",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

def load_history():
    """Load the review-count history table.

    Returns:
        The contents of ``HISTORY_FILE`` as a DataFrame when the file exists;
        otherwise an empty DataFrame with the columns ``url``,
        ``restaurant_name``, ``review_count`` and ``last_updated``.
    """
    # Guard clause: no history yet, so hand back an empty, correctly-shaped table.
    if not os.path.exists(HISTORY_FILE):
        empty_columns = ['url', 'restaurant_name', 'review_count', 'last_updated']
        return pd.DataFrame(columns=empty_columns)
    return pd.read_csv(HISTORY_FILE)

def save_history(df):
    """Write *df* to ``HISTORY_FILE`` as CSV, without the index column.

    The file is encoded as ``utf-8-sig`` (UTF-8 with a byte-order mark).
    """
    df.to_csv(
        path_or_buf=HISTORY_FILE,
        encoding='utf-8-sig',
        index=False,
    )

def modify_url_for_reviews(url):
    """Return the review-page URL for *url* with ``sortingFilter=D`` applied.

    Args:
        url: A restaurant URL, either the main page or an existing
            ``/review`` URL, with or without a query string.

    Returns:
        * If *url* already contains ``/review`` and ``sortingFilter=D``, it is
          returned unchanged.
        * If *url* already contains ``/review``, ``sortingFilter=D`` is
          appended using ``?`` when there is no query string yet and ``&``
          otherwise.
        * Otherwise ``/review`` is inserted after the path (one trailing ``/``
          removed), any existing query string is kept, and
          ``sortingFilter=D`` is added.
    """
    sort_param = "sortingFilter=D"

    if "/review" in url:
        if sort_param in url:
            return url
        if "?" not in url:
            # No query string yet: the first parameter must be introduced
            # with "?", not "&", or the parameter becomes part of the path.
            return f"{url}?{sort_param}"
        # Avoid "?&" / "&&" when the URL already ends with a separator.
        separator = "" if url.endswith(("?", "&")) else "&"
        return f"{url}{separator}{sort_param}"

    base, _, query = url.partition("?")
    if base.endswith("/"):
        base = base[:-1]
    if query:
        return f"{base}/review?{query}&{sort_param}"
    return f"{base}/review?{sort_param}"

def parse_date(date_str):
    """Convert a review date label into a ``datetime``.

    Recognised forms:
        * ``"N일 전"`` (N days ago)  -> now minus N days
        * ``"N시간 전"``, ``"N분 전"``, ``"방금"`` -> now
        * ``"어제"`` (yesterday)     -> now minus one day
        * ``"YYYY.MM.DD"``           -> that date at midnight

    Surrounding whitespace and a trailing ``.`` are ignored before parsing.

    Args:
        date_str: The date label text.

    Returns:
        The parsed ``datetime``. If the label cannot be interpreted (unknown
        format, non-string input), the current time is returned so that the
        caller never treats an unparseable label as older than its cutoff.
    """
    today = datetime.datetime.now()
    try:
        # Normalise so that " 2025.12.09 " or "2025.12.09." still parse
        # instead of silently falling back to "today".
        text = date_str.strip().rstrip(".")
        if "일 전" in text:
            days = int(re.search(r'(\d+)', text).group(1))
            return today - datetime.timedelta(days=days)
        if "시간 전" in text or "분 전" in text or "방금" in text:
            return today
        if "어제" in text:
            return today - datetime.timedelta(days=1)
        return datetime.datetime.strptime(text, "%Y.%m.%d")
    except (AttributeError, TypeError, ValueError, OverflowError):
        # AttributeError: non-string input or no digits before "일 전";
        # ValueError: unknown date format; OverflowError: absurd day count.
        # Deliberately not a bare except, so Ctrl+C / kernel interrupts
        # are not swallowed here.
        return today

In [3]:
def _find_review_cards(driver):
    """Return the review card elements currently in the DOM, or [] if the lookup fails."""
    try:
        return driver.find_elements(
            By.CSS_SELECTOR,
            "#main > div.container.gutter-sm > div > div > div > div",
        )
    except Exception:
        return []


def _card_text(card, selector, default="Unknown"):
    """Return the text of the first element under *card* matching *selector*, or *default* on failure."""
    try:
        return card.find_element(By.CSS_SELECTOR, selector).text
    except Exception:
        return default


def scrape_reviews(driver, url, restaurant_name):
    """Collect reviews for one restaurant from its review page.

    The page is opened through ``modify_url_for_reviews`` (which adds
    ``sortingFilter=D`` -- presumably newest-first ordering; the cutoff logic
    below relies on that, confirm against the site). Review cards are read,
    then the page is advanced by hovering the last card and pressing
    PageDown twice, and the cards are read again.

    Scraping stops when any of the following happens:
        * a review dated before ``CUTOFF_DATE`` is encountered,
        * several consecutive scrolls yield no new review,
        * the maximum number of scrolls is reached,
        * the page can no longer be scrolled.

    Args:
        driver: A Selenium WebDriver.
        url: The restaurant URL (main page or review page).
        restaurant_name: Name stored in each collected record.

    Returns:
        A list of dicts with the keys ``restaurant``, ``reviewer``,
        ``review_date`` (the raw label text), ``reviewer_rating`` and
        ``day_night``. Fields that could not be read hold ``"Unknown"``.
        Reviews are de-duplicated on (reviewer, date label, restaurant).
    """
    review_url = modify_url_for_reviews(url)
    driver.get(review_url)
    time.sleep(5)

    collected_reviews = []
    seen_keys = set()

    scroll_count = 0
    max_scrolls = 50  # generous upper bound
    idle_scrolls = 0
    max_idle_scrolls = 5  # stop once this many scrolls in a row add nothing
    reached_cutoff = False

    while scroll_count < max_scrolls:
        cards = _find_review_cards(driver)

        # On the very first pass, give the page one extra chance to load.
        if not cards and scroll_count == 0:
            time.sleep(3)
            cards = _find_review_cards(driver)

        found_in_this_scroll = False

        for card in cards:
            try:
                reviewer = _card_text(
                    card,
                    "article > div.__header > div.__user-info > a > h4 > span",
                )
                rating = _card_text(
                    card,
                    "article > div.__header > div.__review-meta.__review-meta--with-rating > div > a > div",
                )

                date_text = _card_text(
                    card,
                    "article > div.__header > div.__review-meta.__review-meta--with-rating > span",
                    default=None,
                )
                if date_text is None:
                    # Fallback: look for a date pattern anywhere in the card text.
                    match = re.search(r'\d{4}\.\d{1,2}\.\d{1,2}|\d+일 전', card.text)
                    date_text = match.group(0) if match else "Unknown"

                # Stop at the first review older than the cutoff.
                if date_text != "Unknown":
                    review_date = parse_date(date_text)
                    if review_date < CUTOFF_DATE:
                        reached_cutoff = True
                        break

                day_night = _card_text(
                    card,
                    "article > div.__header > div.__review-meta.__review-meta--with-rating > div > p",
                )

                # Cards already seen on an earlier scroll are still in the DOM;
                # only record each (reviewer, date, restaurant) once.
                review_key = (reviewer, date_text, restaurant_name)
                if review_key not in seen_keys:
                    seen_keys.add(review_key)
                    collected_reviews.append({
                        "restaurant": restaurant_name,
                        "reviewer": reviewer,
                        "review_date": date_text,
                        "reviewer_rating": rating,
                        "day_night": day_night
                    })
                    found_in_this_scroll = True
            except Exception:
                # Best effort: a card that cannot be read (e.g. it went stale)
                # is skipped. Not a bare except, so interrupts still propagate.
                continue

        if reached_cutoff:
            break

        # Without this check a page that has run out of reviews would be
        # scrolled until max_scrolls is exhausted.
        if found_in_this_scroll:
            idle_scrolls = 0
        else:
            idle_scrolls += 1
            if idle_scrolls >= max_idle_scrolls:
                break

        # Scroll: hover the last card, then PageDown twice.
        try:
            if cards:
                last_card = cards[-1]
                actions = ActionChains(driver)
                actions.move_to_element(last_card).perform()
                time.sleep(0.5)
                actions.send_keys(Keys.PAGE_DOWN).pause(0.5).send_keys(Keys.PAGE_DOWN).perform()
                time.sleep(1.5)
            else:
                driver.find_element(By.TAG_NAME, "body").send_keys(Keys.PAGE_DOWN)
                time.sleep(1)
        except Exception:
            try:
                driver.find_element(By.TAG_NAME, "body").send_keys(Keys.PAGE_DOWN)
            except Exception:
                # The page cannot be scrolled at all; keep what was collected
                # instead of raising and losing it.
                break

        scroll_count += 1

    return collected_reviews

In [None]:
# [Main run]
# For every restaurant in INPUT_FILE: read its current review count, compare
# it with the recorded history, scrape when the restaurant is new or the
# count changed, then write the collected reviews and the refreshed history.

if os.path.exists(INPUT_FILE):
    df = pd.read_csv(INPUT_FILE)
    history_df = load_history()

    driver = get_driver()

    all_collected_data = []
    history_updates = []

    print(f"총 {len(df)}개 식당 처리 시작...")

    try:
        for idx, row in df.iterrows():
            url = row['URL']

            # Prefer the restaurant name from the CSV; fall back to the row index.
            restaurant_name = row['restaurant'] if 'restaurant' in row else str(idx)

            try:
                # 1. Open the restaurant's main page.
                driver.get(url)
                time.sleep(3)

                # 2. Read the review count shown on the page.
                count_read_ok = True
                try:
                    count_el = driver.find_element(By.XPATH, '//*[@id="wrapperDiv"]/div[1]/div[1]/div[3]/div/span[3]')
                    count_text = count_el.text
                    # e.g. "리뷰 268개" -> 268
                    review_count = int(re.search(r'(\d+)', count_text.replace(',', '')).group(1))
                except Exception:
                    # The count could not be read; treat it as 0 for the
                    # decisions below but remember that it is not trustworthy.
                    review_count = 0
                    count_read_ok = False

                print(f"[{idx+1}/{len(df)}] {restaurant_name}: {review_count}개", end=" ")

                # 3. Change detection against the recorded history.
                prev_record = history_df[history_df['url'] == url]
                need_crawl = False

                if prev_record.empty:
                    print("-> [신규]", end=" ")
                    need_crawl = True
                else:
                    last_count = int(prev_record.iloc[0]['review_count'])
                    if last_count != review_count:
                        print(f"-> [변동: {last_count}->{review_count}]", end=" ")
                        # A changed count of 0 is not crawled.
                        if review_count > 0: need_crawl = True
                    else:
                        print("-> [변동없음]", end=" ")

                # 4. Scrape when needed.
                if need_crawl:
                    print("-> 수집 시작")
                    reviews = scrape_reviews(driver, url, restaurant_name)
                    if reviews:
                        all_collected_data.extend(reviews)
                        print(f"   >>> {len(reviews)}건 수집 완료")
                    else:
                        print("   >>> 수집된 리뷰 없음")
                else:
                    print("") # newline

                # 5. Record the history entry only now, after the scrape has
                # finished without raising: if it had been recorded earlier, a
                # failed scrape would be marked as seen and never retried.
                # A failed count read must also not overwrite a known count.
                if count_read_ok or prev_record.empty:
                    history_updates.append({
                        'url': url,
                        'restaurant_name': restaurant_name,
                        'review_count': review_count,
                        'last_updated': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    })

            except Exception as e:
                print(f"\n   !!! 에러 발생: {e}")

    except KeyboardInterrupt:
        # Stop early but fall through to the save steps below so that the
        # data gathered so far is not lost.
        print("\n사용자 중단: 지금까지 수집한 데이터를 저장합니다.")

    finally:
        driver.quit()

    # Save the collected reviews.
    if all_collected_data:
        result_df = pd.DataFrame(all_collected_data)
        result_df.to_csv(OUTPUT_FILE, index=False, encoding='utf-8-sig')
        print(f"\n전체 결과 저장 완료: {OUTPUT_FILE} ({len(result_df)}건)")
    else:
        print("\n새로 수집된 리뷰가 없습니다.")

    # Save the history.
    if history_updates:
        new_history = pd.DataFrame(history_updates)
        if not history_df.empty:
            # Drop the old rows for the URLs being updated, then append the new rows.
            history_df = history_df[~history_df['url'].isin(new_history['url'])]
            final_history = pd.concat([history_df, new_history], ignore_index=True)
        else:
            final_history = new_history
        save_history(final_history)
        print("히스토리 업데이트 완료")

else:
    print("입력 CSV 파일을 찾을 수 없습니다.")

총 118개 식당 처리 시작...
[1/118] 유용욱 바베큐 연구소: 272개 -> [변동없음] 
[2/118] IMOK Smoke Dining: 1505개 -> [변동: 1488->1505] -> 수집 시작


- `review_count_history.csv`: 히스토리 데이터
- `reviews_collected_20260114.csv`: 리뷰 데이터