## 네이버 리뷰 크롤링 version 3.0
- 제작자: SCL 디지털기획팀 최영부  
- 일자: 2024. 05. 28 
- 기존버전 대비 변경사항 : 
  1) 네이버 플레이스 리뷰의 구조 변경에 따라 코드 수정  
  2) 화면배율 50%로 축소(options.add_argument('--force-device-scale-factor=0.5')) 
     ~ 몇몇 페이지에서 최하단 "테마리스트"때문에 "더보기" 버튼이 화면에 보이지 않아 클릭할 수 없는 현상 수정 

In [None]:
#!pip install selenium
#!pip install bs4
#!pip install requests
#!pip install urllib3
#!pip install webdriver_manager
#!pip install openpyxl
#!pip install xlrd
#!pip install datetime
#!pip install lxml
#!pip install pandas

In [None]:
## 필요한 패키지 로딩 
import time
import datetime
import requests
import selenium
import lxml
import re
import warnings 
warnings.filterwarnings('ignore')

from selenium.webdriver.common.by import By
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from openpyxl import Workbook
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys

In [None]:
## Load the list of stores (store names and their URLs)
import pandas as pd 
# The crawl loop further down reads the 'store_name', 'store_url_naver',
# 'brand' and 'store_type' columns of this frame.
df = pd.read_excel('data/store_list.xlsx')
print(df.shape)  # (rows, columns) of the loaded list, as a quick sanity check
df.head()  # last expression of the cell; in a notebook this displays the first rows

---
##### - 사용자가 특정 날짜를 지정하면, 해당 일자(당일 포함) 이후에 작성된 리뷰만 가져온다. 
##### - user_specified_date = "2024-01-01"과 같이, 반드시 "yyyy-mm-dd"형태로 정확한 날짜를 입력할 것 
---

In [None]:
# Cut-off date chosen by the user, written as YYYY-MM-DD.
# Reviews dated on or after this day are the ones the crawl loop keeps.
user_specified_date = "2023-10-19"

# Parse the string once so later comparisons work on datetime objects.
_USER_DATE_FORMAT = '%Y-%m-%d'
user_date = datetime.datetime.strptime(user_specified_date, _USER_DATE_FORMAT)

In [None]:
import os  # cell-local import, like the other per-cell imports in this notebook

# Date text inside a review, in the form "<year>년 <month>월 <day>일".
_REVIEW_DATE_PATTERN = re.compile(r'(\d{4})년 (\d{1,2})월 (\d{1,2})일')

# Element that is clicked repeatedly to expand the review list
# (presumably the "더보기" / "show more" button -- confirm against the live page).
_MORE_BUTTON_XPATH = '//*[@id="app-root"]/div/div/div/div[6]/div[2]/div[3]/div[2]/div/a/span'


def _parse_review_date(text):
    """Return the datetime encoded in *text*, or None when no valid date is found."""
    match = _REVIEW_DATE_PATTERN.search(text)
    if match is None:
        return None
    year, month, day = map(int, match.groups())
    try:
        return datetime.datetime(year, month, day)
    except ValueError:
        # The digits matched the pattern but do not form a real calendar date.
        return None


def _text_at(nodes, index):
    """Return the text of ``nodes[index]``, or '' when the list is too short."""
    return nodes[index].text if len(nodes) > index else ''


# Browser options are identical for every store, so build them once.
options = webdriver.ChromeOptions()
# Uncomment to run without a visible browser window:
# options.add_argument('headless')
# NOTE(review): Chrome documents this switch as "window-size=W,H"; the "WxH"
# form is kept as-is because changing the window size could change the layout
# the scale factor below was tuned for -- confirm before changing.
options.add_argument('window-size=1920x1080')
# 50% zoom so the "더보기" button stays visible once the page is fully scrolled down.
options.add_argument('--force-device-scale-factor=0.5')
options.add_argument("disable-gpu")

# Make sure the destination folder for the per-store workbooks exists.
os.makedirs('output', exist_ok=True)

# Crawl every store in the list and write one workbook per store.
for x, (url, s_brand, s_type, s_store) in enumerate(
        zip(df['store_url_naver'], df['brand'], df['store_type'], df['store_name'])):

    # Fresh workbook for this store; reviews go to the 'output' sheet.
    xlsx = Workbook()
    list_sheet = xlsx.create_sheet('output')
    list_sheet.append(['brand', 'type','store', 'date', 'nickname', 'content', 'revisit'])

    driver = None
    skipped = 0

    # Start crawling/scraping!
    try:
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        driver.implicitly_wait(5)

        # Page down until the viewport bottom reaches the end of the document.
        while True:
            driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN)
            time.sleep(1)
            end_of_page = driver.execute_script("return window.scrollY + window.innerHeight >= document.body.scrollHeight")
            if end_of_page:
                break

        # Keep clicking the expand button; the loop only ends when locating or
        # clicking it raises, which is treated as "everything is expanded".
        try:
            while True:
                driver.find_element(By.XPATH, _MORE_BUTTON_XPATH).click()
                time.sleep(5)
        except Exception:
            print( str(x) +' '+'store_started')

        # Give the page time to settle before taking the HTML snapshot.
        time.sleep(10)
        bs = BeautifulSoup(driver.page_source, 'lxml')

        for r in bs.select('li.owAeM'):
            nickname = r.select_one('div.qgLL3>span.P9EZi')
            content = r.select_one('div.vg7Fp.CyA_N span.zPfVt')
            # The second matching element holds the value; tolerate reviews
            # where it is missing instead of failing the whole store.
            date = _text_at(r.select('span.CKUdu>span.place_blind'), 1)
            revisit = _text_at(r.select('div.D40bm>span.CKUdu'), 1)

            nickname = nickname.text if nickname is not None else ''
            content = content.text if content is not None else ''

            review_date = _parse_review_date(date)
            if review_date is None:
                # Without a date the review cannot be compared to the cut-off.
                skipped += 1
                continue

            # Keep only reviews written on or after the user-specified date.
            if review_date >= user_date:
                list_sheet.append([s_brand, s_type, s_store, review_date, nickname, content, revisit])

        if skipped:
            print(str(x) + ' ' + f'skipped {skipped} review(s) without a parsable date')

    except Exception as e:
        # Best effort: report the problem and still save what was collected.
        print(e)

    finally:
        try:
            # Save the file (complete or partial).
            file_name = 'naver_review_'+ str(s_store) + '.xlsx'
            xlsx.save('output/' + file_name)
            print( str(x) +' '+'store_finish')
        finally:
            # Always close the browser so Chrome processes do not pile up.
            if driver is not None:
                driver.quit()

In [None]:
## output 폴더내 생성된 모든 (매장별)엑셀파일을 하나로 합치기 
 
import glob
import datetime as dt

import os  # cell-local import, next to the other imports of this cell

# Merge every per-store workbook in output/ into a single Excel file.
try:
    path = 'output/'
    # Sorted so the row order of the merged file is reproducible between runs.
    files = sorted(glob.glob(path + "*.xlsx"))

    # Read all workbooks first and concatenate once. The frames are kept out of
    # the global name `df`, which holds the store list used by the crawl cell.
    frames = [pd.read_excel(file_name, sheet_name='output') for file_name in files]
    excel = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Make sure the destination folder exists before writing.
    os.makedirs('concated_file', exist_ok=True)
    excel.to_excel('concated_file/naver_review_all.xlsx', index=False)

except Exception as ex:
    # Best effort: report the failure instead of stopping the notebook.
    print('error' + str(ex))