## 네이버 리뷰 크롤링 version 6.1
- 제작자: SCL 디지털기획팀 최영부  
- 일자: 2024. 08.05 
- 기존버전 대비 변경사항 : 
  1) 네이버 플레이스 리뷰의 구조 변경에 따라 코드 수정  
  2) 화면배율 50%로 축소(options.add_argument('--force-device-scale-factor=0.5')) 
     ~ 몇몇 페이지에서 최하단 "테마리스트"때문에 "더보기" 버튼이 화면에 보이지 않아 클릭할 수 없는 현상 수정 
  3) 전체 시간단축을 위해 페이지 펼침 동작을 최대 60번만 하도록 함 (최근 600개의 리뷰만 가져옴)
     ~ 정육점/서리재는 5번, 차이797은 20번, 호우섬은 60번 펼침이 적당한 듯 
  4) 데이터 수집항목 추가 : 대댓글 작성일자 및 내용 

In [1]:
#!pip install selenium
#!pip install bs4
#!pip install requests
#!pip install urllib3
#!pip install webdriver_manager
#!pip install openpyxl
#!pip install xlrd
#!pip install datetime
#!pip install lxml
#!pip install pandas

In [2]:
#!pip install time 
import time
start_time = time.time() 

In [3]:
## 필요한 패키지 로딩 
import time
import datetime
import requests
import selenium
import lxml
import re
import warnings 
warnings.filterwarnings('ignore')

from selenium.webdriver.common.by import By
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from openpyxl import Workbook
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys

In [4]:
## 매장명, url 리스트 로딩 
import pandas as pd 
df = pd.read_excel('data/store_list.xlsx')
print(df.shape)
df.tail()

(15, 5)


Unnamed: 0,no,brand,store_type,store_name,store_url_naver
10,67,호우섬,CD,호우섬 현대천호점,https://m.place.naver.com/restaurant/110679555...
11,68,호우섬,CD,호우섬 롯데수원점,https://m.place.naver.com/restaurant/174571964...
12,69,호우섬,UCD,살롱드호우섬 신세계 강남점,https://m.place.naver.com/restaurant/198700504...
13,70,호우섬,UCD,살롱드호우섬 디타워 서울포레스트점,https://m.place.naver.com/restaurant/180354151...
14,71,호우섬,UCD,살롱드호우섬 현대목동점,https://m.place.naver.com/restaurant/165229957...


---
##### - 사용자가 특정 날짜를 지정하면, 해당 일자 이후의 데이터만 가져오게 된다. 
##### - user_specified_date = "2024-01-01"과 같이, 반드시 "yyyy-mm-dd"형태로 정확한 날짜를 입력할 것 
---

In [5]:
# User-defined date in YYYY-MM-DD format : 사용자가 지정한 날짜 이후의 데이터만 가져온다. 
user_specified_date = "2024-07-01"

# Convert the user-specified date to a datetime object
user_date = datetime.datetime.strptime(user_specified_date, '%Y-%m-%d')

In [6]:
import time
import datetime
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from openpyxl import Workbook

# Assumption: df, user_date variables are defined before running this loop
for x in range(len(df['store_name'])): 
    url = df['store_url_naver'][x]
    s_brand = df['brand'][x]
    s_type = df['store_type'][x]
    s_store = df['store_name'][x]
    
    # Webdriver headless mode setting
    options = webdriver.ChromeOptions()
    options.add_argument('window-size=1920x1080')
    options.add_argument('--force-device-scale-factor=0.5') #화면 배율 50%로, 마지막까지 scroll down된 상태에서 "더보기"가 보이도록
    options.add_argument("disable-gpu")

    # BS4 setting for secondary access
    session = requests.Session()
    headers = {
        "User-Agent": "user value"}

    retries = Retry(total=5,
                    backoff_factor=0.1,
                    status_forcelist=[500, 502, 503, 504])

    session.mount('http://', HTTPAdapter(max_retries=retries))

    # New xlsx file
    now = datetime.datetime.now()
    xlsx = Workbook()
    list_sheet = xlsx.create_sheet('output')    
    list_sheet.append(['brand', 'type','store', 'date', 'nickname', 'content', 'revisit', 'reply_txt', 'reply_date'])

    # Start crawling/scraping!
    try:        
        driver = webdriver.Chrome(options=options)        
        driver.get(url)
        driver.implicitly_wait(5)

        # 최근 n x 10개의 리뷰만 가져오도록 (접힌 페이지를 n번만 펼치도록)
        scroll_count = 0
        while scroll_count < 60:  
            driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN)
            time.sleep(1)  # 기존의 5초에서 1초로 변경
            scroll_count += 1

        try:
            for _ in range(60):  # "펼침" 버튼을 최대 n번 클릭
                driver.find_element(By.XPATH, '//*[@id="app-root"]/div/div/div/div[6]/div[2]/div[3]/div[2]/div/a/span').click()
                time.sleep(1)  # 기존의 5초에서 1초로 변경
        except Exception as e:
            print(str(e) + ' ' + 'store_started')
        

        time.sleep(5)
        html = driver.page_source
        bs = BeautifulSoup(html, 'lxml')
        reviews = bs.select('li.owAeM') # 네이버의 변경부분 반영 

        for r in reviews:             
            brand = s_brand
            store_type = s_type
            store_name = s_store
            
            # 해당 매장의 검색결과 화면에서, iframe내 값들을 확인 (네이버의 변경부분 반영)
            nickname = r.select_one('div.qgLL3>span.P9EZi')
            content = r.select_one('div.vg7Fp.CyA_N span.zPfVt')
            review_date_element = r.select_one('div.D40bm > span:nth-child(1) > span:nth-child(3)')
            revisit = r.select('div.D40bm>span.CKUdu')[1]
            
            # 주의! 점장의 대댓글은 7월부터 등록하기 시작함, 즉 모든 리뷰에 존재하지 않음            
            reply_date_element = r.select_one('div.Es4CD > span.cXRZU > time[aria-hidden="true"]')
            reply_txt_element = r.select_one('div.Aijox div.Qy2RA')

            # exception handling
            nickname = nickname.text if nickname else ''
            content = content.text if content else ''
            review_date = review_date_element.text if review_date_element else ''
            revisit = revisit.text if revisit else ''            

            reply_date = reply_date_element.text if reply_date_element else ''            
            reply_txt = reply_txt_element.text if reply_txt_element else ''
            
            # Convert the review date to a datetime object
            match = re.search(r'(\d{4})년 (\d{1,2})월 (\d{1,2})일', review_date)
            if match:
                year, month, day = map(int, match.groups())
                formatted_review_date = f'{year:04d}-{month:02d}-{day:02d}'
                review_date = datetime.datetime.strptime(formatted_review_date, '%Y-%m-%d')
            else:
                review_date = None

            # Convert the reply date to a datetime object
            if reply_date:
                match_2 = re.search(r'(\d{1,2})\.(\d{1,2})\.\w', reply_date)
                if match_2:
                    month_2, day_2 = map(int, match_2.groups())
                    year_2 = 2024  # 연도는 고정된 2024년
                    formatted_date_2 = f'{year_2:04d}-{month_2:02d}-{day_2:02d}' 
                    replied_date = datetime.datetime.strptime(formatted_date_2, '%Y-%m-%d')
                else:
                    replied_date = None
            else:
                replied_date = None    

            # Check if the review date is after the user-specified date
            if review_date and review_date >= user_date:
                list_sheet.append([brand, store_type, store_name, review_date, nickname, content, revisit, reply_txt, replied_date])
                time.sleep(1)  # 기존의 5초에서 1초로 변경
            
        # Save the file
        file_name = 'naver_review_' + str(s_store) + '.xlsx'
        xlsx.save('output/' + file_name)
        print( str(x) +' '+'store_finish')

    except Exception as e:
        print(e)
        # Save the file(temp)
        file_name = 'naver_review_' + str(s_store) + '.xlsx'
        xlsx.save('output/' + file_name)
        print( str(x) +' '+'store_finish')


0 store_finish
Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="app-root"]/div/div/div/div[6]/div[2]/div[3]/div[2]/div/a/span"}
  (Session info: chrome=127.0.6533.89); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF63F6D9632+30946]
	(No symbol) [0x00007FF63F68E3C9]
	(No symbol) [0x00007FF63F586FDA]
	(No symbol) [0x00007FF63F5D822C]
	(No symbol) [0x00007FF63F5D850C]
	(No symbol) [0x00007FF63F61DCB7]
	(No symbol) [0x00007FF63F5FCAAF]
	(No symbol) [0x00007FF63F61B041]
	(No symbol) [0x00007FF63F5FC813]
	(No symbol) [0x00007FF63F5CA6E5]
	(No symbol) [0x00007FF63F5CB021]
	GetHandleVerifier [0x00007FF63F80F83D+1301229]
	GetHandleVerifier [0x00007FF63F81BDB7+1351783]
	GetHandleVerifier [0x00007FF63F812A03+1313971]
	GetHandleVerifier [0x00007FF63F70DD06+245686]
	(No symbol) [0x00007FF63F69758F]
	(No symbol) [0x00007F

In [7]:
end_time = time.time()
execution_time = (end_time - start_time)/60
print("Execution time:", execution_time, "min")

Execution time: 134.44943492412568 min


In [9]:
'''
## output 폴더내 생성된 모든 (매장별)엑셀파일을 하나로 합치기 
 
import glob
import datetime as dt

try:
    path = 'output/'
    files = glob.glob(path + "*.xlsx")
    excel = pd.DataFrame()
    for file_name in files:
        df = pd.read_excel(file_name, sheet_name='output')
        #excel = excel.append(df, ignore_index=True)        ## pandas 2.xx 버전 이후에서는 append가 작동하지 않음 
        excel = pd.concat([excel, df], ignore_index=True)
    #print(excel)
    excel.to_excel('concated_file/naver_review_all.xlsx', index=False)
    
except Exception as ex:
    print('error' + str(ex))

'''