## 네이버 리뷰 크롤링 version 9 
- 제작자: SCL 디지털기획팀 최영부  
- 일자: 2024. 10. 03 
- 기존버전 대비 변경사항 : 
  1) 네이버 플레이스 리뷰의 구조 변경에 따라 코드 수정  
  2) 화면배율 50%로 축소(options.add_argument('--force-device-scale-factor=0.5')) 
     ~ 몇몇 페이지에서 최하단 "테마리스트"때문에 "더보기" 버튼이 화면에 보이지 않아 클릭할 수 없는 현상 수정 
  3) 전체 시간단축을 위해 페이지 펼침 동작을 최대 60번만 하도록 함 (최근 600개의 리뷰만 가져옴)
     ~ 정육점/서리재는 5번, 차이797은 20번, 호우섬은 60번 펼침이 적당한 듯 
  4) 데이터 수집항목 추가 : 대댓글 작성일자 및 내용 
  5) WebDriverWait을 사용하여 버튼 클릭을 기다리고, scrollIntoView()를 사용하여 버튼이 화면에 나타나도록 스크롤하며, 클릭할 수 있을 때까지 재시도
  6) ** 댓글 내용(reply_txt) 데이터 가져오기 계속 실패 중 ... 

In [1]:
#!pip install selenium
#!pip install bs4
#!pip install requests
#!pip install urllib3
#!pip install webdriver_manager
#!pip install openpyxl
#!pip install xlrd
#!pip install datetime
#!pip install lxml
#!pip install pandas

In [2]:
#!pip install time 
import time
start_time = time.time() 

In [3]:
## 필요한 패키지 로딩 
import time
import datetime
import requests
import selenium
import lxml
import re
import warnings 
warnings.filterwarnings('ignore')

from selenium.webdriver.common.by import By
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from openpyxl import Workbook
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys

In [4]:
## 매장명, url 리스트 로딩 
import pandas as pd 
df = pd.read_excel('data/store_list.xlsx')
print(df.shape)
df.tail()

(39, 5)


Unnamed: 0,no,brand,store_type,store_name,store_url_naver
34,35,Chai797,CD,Chai797 Plus 롯데부산본점,https://m.place.naver.com/restaurant/110382196...
35,62,Chai797,CD,Chai797 스타필드 수원점,https://m.place.naver.com/restaurant/145004355...
36,63,Chai797,CD,Chai797 롯데백화점 포항점,https://m.place.naver.com/restaurant/162294287...
37,64,Chai797,CD,Chai797 신세계센텀시티점,https://m.place.naver.com/restaurant/180353596...
38,65,Chai797,CD,Chai797 롯데백화점 동탄점,https://m.place.naver.com/restaurant/140301219...


---
##### - 사용자가 특정 날짜를 지정하면, 해당 일자 이후의 데이터만 가져오게 된다. 
##### - user_specified_date = "2024-01-01"과 같이, 반드시 "yyyy-mm-dd"형태로 정확한 날짜를 입력할 것 
---

In [5]:
# User-defined date in YYYY-MM-DD format : 사용자가 지정한 날짜 이후의 데이터만 가져온다. 
user_specified_date = "2024-09-01"

# Convert the user-specified date to a datetime object
user_date = datetime.datetime.strptime(user_specified_date, '%Y-%m-%d')

In [6]:
import time
import datetime
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from openpyxl import Workbook

# Assumption: df, user_date variables are defined before running this loop
for x in range(len(df['store_name'])):
    url = df['store_url_naver'][x]
    s_brand = df['brand'][x]
    s_type = df['store_type'][x]
    s_store = df['store_name'][x]

    # Webdriver headless mode setting
    options = webdriver.ChromeOptions()
    options.add_argument('window-size=1920x1080')
    options.add_argument('--force-device-scale-factor=0.4')  # 화면 배율 50%로, 마지막까지 scroll down된 상태에서 "더보기"가 보이도록
    options.add_argument("disable-gpu")

    # BS4 setting for secondary access
    session = requests.Session()
    headers = {
        "User-Agent": "user value"}

    retries = Retry(total=5,
                    backoff_factor=0.1,
                    status_forcelist=[500, 502, 503, 504])

    session.mount('http://', HTTPAdapter(max_retries=retries))

    # New xlsx file
    now = datetime.datetime.now()
    xlsx = Workbook()
    list_sheet = xlsx.create_sheet('output')
    list_sheet.append(['brand', 'type', 'store', 'date', 'nickname', 'content', 'revisit', 'reply_txt', 'reply_date'])

    # Start crawling/scraping!
    try:
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        driver.implicitly_wait(5)

        # 최근 n x 10개의 리뷰만 가져오도록 (접힌 페이지를 n번만 펼치도록)
        scroll_count = 0
        while scroll_count < 15:
            driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN)
            time.sleep(2)  # 페이지 다운 후 2초 대기
            scroll_count += 1

        # Scroll to and click "펼침" 버튼
        for _ in range(15):  # "펼침" 버튼을 최대 n번 클릭
            try:
                element = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, '//*[@id="app-root"]/div/div/div/div[6]/div[2]/div[3]/div[2]/div/a/span'))
                )
                driver.execute_script("arguments[0].scrollIntoView();", element)
                element.click()
                time.sleep(2)  # 클릭 후 2초 대기
            except Exception as e:
                print(f"Error clicking button: {str(e)}")
                time.sleep(2)  # 여유 시간 추가

        time.sleep(5)
        html = driver.page_source
        bs = BeautifulSoup(html, 'lxml')
        reviews = bs.select('li.pui__X35jYm.EjjAW')

        for r in reviews:
            brand = s_brand
            store_type = s_type
            store_name = s_store

            # 해당 매장의 검색결과 화면에서, iframe내 값들을 확인 (네이버의 변경부분 반영)
            nickname = r.select_one('div.pui__JiVbY3 > span.pui__uslU0d')
            content = r.select_one('div.pui__vn15t2 > a.pui__xtsQN-')
            review_date_element = r.select_one('div.pui__QKE5Pr > span.pui__gfuUIT > time[aria-hidden="true"]')
            revisit = r.select_one('div.pui__QKE5Pr > span:nth-of-type(2)')

            # 주의! 점장의 대댓글은 7월부터 등록하기 시작함, 즉 모든 리뷰에 존재하지 않음
            reply_date_element = r.select_one('div.pui__MzrP-X > span.pui__4APmFd > time[aria-hidden="true"]')
            
            # Try to click "더보기" button if exists
            more_button = r.select_one('div.pui__J0tczd > a.pui__wFzIYl')
            if more_button:
                try:
                    driver.execute_script("arguments[0].click();", more_button)  # click() 대신 execute_script를 통해 클릭
                    time.sleep(2)  # 추가 콘텐츠 로딩 대기
                except Exception as e:
                    print(f"Error clicking '더보기' button: {str(e)}")
            else:
                print("No '더보기' button found.")
            
            
            reply_txt_element = r.select_one('div.pui__J0tczd > a.pui__xtsQN-')

            # exception handling
            nickname_text = nickname.text if nickname else ''
            content_text = content.text if content else ''
            review_date_text = review_date_element.text if review_date_element else ''
            revisit_text = revisit.text if revisit else ''

            reply_date_text = reply_date_element.text if reply_date_element else ''
            reply_txt_text = reply_txt_element.text if reply_txt_element else ''

            
            # Convert the review date to a datetime object
            match = re.search(r'(\d{1,2})\.(\d{1,2})\.\w', review_date_text)
            if match:
                month, day = map(int, match.groups())
                year = 2024  # 연도는 고정된 2024년
    
            # 날짜가 유효한지 확인
                try:
                    formatted_review_date = f'{year:04d}-{month:02d}-{day:02d}'
                    review_date = datetime.datetime.strptime(formatted_review_date, '%Y-%m-%d')
                except ValueError:
                    print(f"Invalid date found: {formatted_review_date}")
                    review_date = None
            else:
                review_date = None


            # Convert the reply date to a datetime object
            if reply_date_text:
                match_2 = re.search(r'(\d{1,2})\.(\d{1,2})\.\w', reply_date_text)
                if match_2:
                    month_2, day_2 = map(int, match_2.groups())
                    year_2 = 2024  # 연도는 고정된 2024년
                    formatted_date_2 = f'{year_2:04d}-{month_2:02d}-{day_2:02d}'
                    replied_date = datetime.datetime.strptime(formatted_date_2, '%Y-%m-%d')
                else:
                    replied_date = None
            else:
                replied_date = None

            # Check if the review date is after the user-specified date
            if review_date and review_date >= user_date:
                list_sheet.append([brand, store_type, store_name, review_date, nickname_text, content_text, revisit_text, reply_txt_text, replied_date])
                time.sleep(2)  # 각 리뷰 처리 후 2초 대기

        # Save the file
        file_name = 'naver_review_' + str(s_store) + '.xlsx'
        xlsx.save('output/' + file_name)
        print(str(x) + ' ' + 'store_finish')

    except Exception as e:
        print(e)
        # Save the file(temp)
        file_name = 'naver_review_' + str(s_store) + '.xlsx'
        xlsx.save('output/' + file_name)
        print(str(x) + ' ' + 'store_finish')


No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
No '더보기' button found.
Error clicking '더보기' button: Object of type Tag is not JSON serializable
Error clicking '더보기' button: Object of type Tag is not JSON serializable
Error clicking '더보기' button: Object of type Tag is not JSON serializable
Error clicking '더보기' button: Object of type Tag is not JSON serializable
Error clicking '더보기' button: Object of type Tag is not JSON serializable
Error clicking

In [7]:
end_time = time.time()
execution_time = (end_time - start_time)/60
print("Execution time:", execution_time, "min")

Execution time: 75.36671993335088 min


In [9]:
'''
## output 폴더내 생성된 모든 (매장별)엑셀파일을 하나로 합치기 
 
import glob
import datetime as dt

try:
    path = 'output/'
    files = glob.glob(path + "*.xlsx")
    excel = pd.DataFrame()
    for file_name in files:
        df = pd.read_excel(file_name, sheet_name='output')
        #excel = excel.append(df, ignore_index=True)        ## pandas 2.xx 버전 이후에서는 append가 작동하지 않음 
        excel = pd.concat([excel, df], ignore_index=True)
    #print(excel)
    excel.to_excel('concated_file/naver_review_all.xlsx', index=False)
    
except Exception as ex:
    print('error' + str(ex))
'''
