## 네이버 리뷰 크롤링 version 12
- 제작자: SCL 디지털기획팀 최영부  
- 일자: 2025. 02. 04 
- 기존버전 대비 변경사항 : 
  1) 네이버 플레이스 리뷰의 구조 변경에 따라 코드 수정  
  2) 화면배율 50%로 축소(options.add_argument('--force-device-scale-factor=0.5')) : 일부 하단 "테마리스트"때문에 "더보기" 버튼이 화면에 보이지 않아 클릭할 수 없는 현상 수정 
  3) 사용자가 지정한 날짜(user_specified_date) 이후부터 최근까지의 데이터를 수집함 
  4) 전체 시간단축을 위해 페이지 펼침 동작을 최대 횟수를 사용자가 지정함 (max_scrolls)    
  5) WebDriverWait을 사용하여 버튼 클릭을 기다리고, scrollIntoView()를 사용하여 버튼이 화면에 나타나도록 스크롤하며, 클릭할 수 있을 때까지 재시도

In [None]:
#!pip install selenium
#!pip install bs4
#!pip install requests
#!pip install urllib3
#!pip install webdriver_manager
#!pip install openpyxl
#!pip install xlrd
#!pip install datetime
#!pip install lxml
#!pip install pandas

In [1]:
## 필요한 패키지 로딩 
import time
start_time = time.time() 
import datetime
import requests
import selenium
import lxml
import re
import warnings 
warnings.filterwarnings('ignore')

from selenium.webdriver.common.by import By
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from openpyxl import Workbook
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from requests.packages.urllib3.util.retry import Retry

In [5]:
# DRM 파일 열기 
import xlwings as xw
import pandas as pd

## DRM 파일인 경우 
book = xw.Book('data/store_list.xlsx')
sheet = book.sheets[0]    

df = sheet.used_range.options(pd.DataFrame, index = False).value
print(df.shape)
df.tail()

(40, 5)


Unnamed: 0,no,brand,store_type,store_name,store_url_naver
35,36.0,Chai797,CD,Chai797 스타필드 수원점,https://m.place.naver.com/restaurant/145004355...
36,37.0,Chai797,CD,Chai797 롯데백화점 포항점,https://m.place.naver.com/restaurant/162294287...
37,38.0,Chai797,CD,Chai797 신세계센텀시티점,https://m.place.naver.com/restaurant/180353596...
38,39.0,Chai797,CD,Chai797 롯데백화점 동탄점,https://m.place.naver.com/restaurant/140301219...
39,,,,,


---
##### user_specified_date : 사용자가 특정 날짜를 지정하면, 해당 일자 이후의 데이터만 가져오게 된다. (반드시 "yyyy-mm-dd"형태로 정확한 날짜를 입력할 것) 
##### max_scrolls : 페이지 펼침 횟수 지정, 한 페이지에 10개의 리뷰가 등록되므로 max_scrolls = 10이라면 최근 100개의 리뷰를 가져오게 된다. 
---

In [6]:
################################### 데이터 수집 시작일자, 페이지 펼침 수 지정 ################
user_specified_date = "2025-02-01"
max_scrolls = 10
###########################################################################################

user_date = datetime.datetime.strptime(user_specified_date, '%Y-%m-%d')   ## 사용자 지정 날짜의 타입변경 

In [7]:
# 날짜 처리 함수 정의
def parse_review_date(date_text, default_year):
    try:
        # 날짜 형식이 "1.1.수"와 같은 경우
        if len(date_text.split('.')) == 3:
            month, day, weekday = date_text.split('.')
            month, day = int(month), int(day)
            year = default_year
        # 날짜 형식이 "24.12.28.토"와 같은 경우
        else:
            year, month, day, weekday = date_text.split('.')
            year, month, day = int(year), int(month), int(day)
        return datetime.datetime(year, month, day)
    except ValueError as e:
        print(f"Error parsing date: {date_text}, error: {e}")
    return None


# Assumption: df and user_date variables are defined before running this loop
for x in range(len(df['store_name'])):
    url = df['store_url_naver'][x]
    s_brand = df['brand'][x]
    s_type = df['store_type'][x]
    s_store = df['store_name'][x]

    # Webdriver headless mode setting
    options = webdriver.ChromeOptions()
    options.add_argument('window-size=1920x1080')
    options.add_argument('--force-device-scale-factor=0.4')
    options.add_argument("disable-gpu")
    # options.add_argument("--headless")  # ❗ 수정됨: 백그라운드에서 실행

    # BS4 setting for secondary access
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
    session.mount('http://', HTTPAdapter(max_retries=retries))

    # Prepare Excel file
    now = datetime.datetime.now()
    review_data = []

    # Start crawling/scraping!
    try:
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        driver.implicitly_wait(5)

        # Handle iframe if present
        try:
            iframe = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.TAG_NAME, "iframe"))
            )
            driver.switch_to.frame(iframe)
        except Exception:
            print(f"No iframe found for {s_store}")

        # Scroll and click "펼침" buttons
        scroll_count = 0

        while scroll_count < max_scrolls:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

            try:
                more_button = WebDriverWait(driver, 3).until(
                    EC.element_to_be_clickable((By.XPATH, '//*[@id="app-root"]/div/div/div/div[6]/div[2]/div[3]/div[2]/div/a/span'))
                )
                more_button.click()
                time.sleep(2)
            except Exception:
                print(f"No more content to load for {s_store}")
                break

            scroll_count += 1

        # Extract reviews
        html = driver.page_source
        bs = BeautifulSoup(html, 'lxml')
        reviews = bs.select('li.place_apply_pui.EjjAW')

        for r in reviews:
            brand = s_brand
            store_type = s_type
            store_name = s_store

            # 네이버 리뷰 페이지의 구조는 비 정기적으로 변경됨, 아래 항목들의 현재 구조를 확인한 후 필요시 변경사항 반영/수정 필요            
            nickname = r.select_one('span.pui__NMi-Dp')
            content = r.select_one('div.pui__vn15t2 a[role="button"]')
            review_date_element = r.select_one('time[aria-hidden="true"]')
            revisit = r.select_one('div.pui__QKE5Pr > span:nth-of-type(2)')
            reply_date_element = r.select_one('div.pui__MzrP-X > span.pui__4APmFd > time[aria-hidden="true"]')
            reply_txt_element = r.select_one('div.pui__J0tczd > a[role="button"], div.pui__J0tczd > span[data-pui-click-code="text"]')

            # Handle "더보기" button
            more_button = r.select_one('div.lfH3O > a.fvwqf')

            if more_button:
                try:
                    driver.execute_script("arguments[0].click();", more_button)
                    time.sleep(2)
                except Exception:
                    print(f"Error clicking '더보기' button for {s_store}")

            # Extract text or set default
            nickname_text = nickname.text.strip() if nickname else 'N/A'
            content_text = content.text.strip() if content else 'N/A'
            review_date_text = review_date_element.text.strip() if review_date_element else 'N/A'
            revisit_text = revisit.text.strip() if revisit else 'N/A'
            reply_date_text = reply_date_element.text.strip() if reply_date_element else 'N/A'
            reply_txt_text = reply_txt_element.text.strip() if reply_txt_element else 'N/A'

            # Convert dates
            review_date = parse_review_date(review_date_text, now.year)
            replied_date = parse_review_date(reply_date_text, now.year)

            # Filter reviews by date
            if review_date and review_date >= user_date:
                review_data.append([brand, store_type, store_name, review_date, nickname_text, content_text, revisit_text, reply_txt_text, replied_date])

        # Save to Excel
        df_output = pd.DataFrame(review_data, columns=['brand', 'type', 'store', 'date', 'nickname', 'content', 'revisit', 'reply_txt', 'reply_date'])
        df_output.to_excel(f'output/naver_review_{s_store}.xlsx', index=False)
        print(f"{x} store finished successfully.")

    except Exception as e:
        print(f"Error processing store {s_store}: {e}")

    finally:
        driver.quit()

No iframe found for Chai797 Plus 파미에스테이션점
Error parsing date: N/A, error: not enough values to unpack (expected 4, got 1)
Error parsing date: N/A, error: not enough values to unpack (expected 4, got 1)
0 store finished successfully.
No iframe found for Chai797 Plus 아이파크몰 고척
Error parsing date: N/A, error: not enough values to unpack (expected 4, got 1)
Error parsing date: N/A, error: not enough values to unpack (expected 4, got 1)
Error parsing date: N/A, error: not enough values to unpack (expected 4, got 1)
Error parsing date: N/A, error: not enough values to unpack (expected 4, got 1)
Error parsing date: N/A, error: not enough values to unpack (expected 4, got 1)
Error parsing date: N/A, error: not enough values to unpack (expected 4, got 1)
Error parsing date: N/A, error: not enough values to unpack (expected 4, got 1)
Error parsing date: N/A, error: not enough values to unpack (expected 4, got 1)
Error parsing date: N/A, error: not enough values to unpack (expected 4, got 1)
Error

In [8]:
end_time = time.time()
execution_time = (end_time - start_time)/60
print("Execution time:", execution_time, "min")

Execution time: 98.2288357535998 min


In [9]:

## output 폴더내 생성된 모든 (매장별)엑셀파일을 하나로 합치기 
 
import glob
import datetime as dt

try:
    path = 'output/'
    files = glob.glob(path + "*.xlsx")
    excel = pd.DataFrame()
    for file_name in files:
        df = pd.read_excel(file_name, sheet_name='Sheet1')
        #excel = excel.append(df, ignore_index=True)        ## pandas 2.xx 버전 이후에서는 append가 작동하지 않음 
        excel = pd.concat([excel, df], ignore_index=True)
    #print(excel)
    excel.to_excel('concated_file/naver_review_all.xlsx', index=False)
    
except Exception as ex:
    print('error' + str(ex))