## 네이버 리뷰 크롤링 
- 제작자: SCL 디지털기획팀 최영부  
- 일자: 2023. 9. 30
- 참고자료: https://jinooh.tistory.com/89

In [1]:
#!pip install selenium
#!pip install bs4
#!pip install requests
#!pip install urllib3
#!pip install webdriver_manager
#!pip install openpyxl
#!pip install xlrd
#!pip install pandas    # datetime ships with the standard library (no install needed); pandas is imported below
#!pip install lxml

In [2]:
## 필요한 패키지 로딩 
import time
import datetime
import requests
import selenium
import lxml
import warnings 
warnings.filterwarnings('ignore')

from selenium.webdriver.common.by import By
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from openpyxl import Workbook
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys

In [3]:
## Load the list of stores to crawl (store names and their Naver Place URLs)
import pandas as pd

STORE_LIST_PATH = 'data/store_list.xlsx'

# One row per store; the crawl cell reads the brand, store_type, store_name
# and store_url_naver columns from this frame.
df = pd.read_excel(STORE_LIST_PATH)
print(df.shape)  # quick (rows, columns) sanity check
df.head()

(2, 5)


Unnamed: 0,no,brand,store_type,store_name,store_url_naver
0,18,Chai797,JUCD,Chai797 잠실롯데월드몰점,https://m.place.naver.com/restaurant/193423122...
1,50,호우섬,CD,호우섬 청량리역점,https://m.place.naver.com/restaurant/191734975...


In [4]:
import os

# ---------------------------------------------------------------------------
# Crawl every store listed in `df` and write one workbook per store to
# output/naver_review_<store_name>.xlsx (sheet "output", one row per review).
# ---------------------------------------------------------------------------

OUTPUT_DIR = 'output'
SHEET_HEADER = ['brand', 'type', 'store', 'date', 'nickname', 'content', 'revisit']

# Fixed pause after loading the page, before the HTML snapshot is taken.
PAGE_RENDER_WAIT_SEC = 20

# False: only the reviews present on the initially rendered page are collected.
# True : keep clicking the link below until it can no longer be found, so that
#        more reviews are loaded before the snapshot is taken.
FETCH_ALL_REVIEWS = False
# presumably the "load more reviews" link - verify against the live page markup
MORE_BUTTON_XPATH = '//*[@id="app-root"]/div/div/div/div[7]/div[2]/div[3]/div[2]/a'


def _node_text(node):
    """Return the text of a BeautifulSoup node, or '' when the node is missing."""
    return node.text if node else ''


def _nth_text(nodes, index):
    """Return the text of ``nodes[index]``, or '' when the list is too short."""
    return nodes[index].text if len(nodes) > index else ''


def _expand_all_reviews(driver):
    """Scroll once, then click MORE_BUTTON_XPATH repeatedly until it fails."""
    driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN)
    try:
        while True:
            driver.find_element(By.XPATH, MORE_BUTTON_XPATH).click()
            time.sleep(0.8)
    except Exception:
        # A failed lookup/click is the loop's exit condition (nothing left to
        # expand), so it is deliberately not treated as an error.
        return


# Webdriver settings (built once and shared by every per-store browser).
options = webdriver.ChromeOptions()
#options.add_argument('headless')
# NOTE(review): Chrome documents this switch as "window-size=W,H"; the "WxH"
# spelling is kept unchanged here - confirm it has the intended effect.
options.add_argument('window-size=1920x1080')
options.add_argument("disable-gpu")

# HTTP session with a retry policy. The crawl below never sends a request
# through it; it is kept (and now also mounted for https, which is what the
# store URLs use) in case other cells rely on `session`.
session = requests.Session()
headers = {
    "User-Agent": "user value"}
retries = Retry(total=5,
                backoff_factor=0.1,
                status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Iterate positionally so the loop does not depend on df having a 0..n-1 index.
store_rows = zip(df['store_url_naver'], df['brand'], df['store_type'], df['store_name'])
for x, (url, s_brand, s_type, s_store) in enumerate(store_rows):
    # Fresh workbook per store; it is saved even when the crawl fails part-way,
    # so whatever rows were collected up to that point are kept.
    xlsx = Workbook()
    list_sheet = xlsx.create_sheet('output')
    list_sheet.append(SHEET_HEADER)
    file_name = 'naver_review_' + str(s_store) + '.xlsx'

    try:
        # Pass the options that were configured above (previously built but unused).
        driver = webdriver.Chrome(options=options)
        try:
            driver.implicitly_wait(10)
            driver.get(url)

            if FETCH_ALL_REVIEWS:
                _expand_all_reviews(driver)
                print(str(x) + ' ' + 'store_started')

            time.sleep(PAGE_RENDER_WAIT_SEC)
            html = driver.page_source
        finally:
            # Always close the browser; otherwise one Chrome process leaks per store.
            driver.quit()

        bs = BeautifulSoup(html, 'lxml')
        reviews = bs.select('li.YeINN')

        for r in reviews:
            nickname = _node_text(r.select_one('div.VYGLG'))
            content = _node_text(r.select_one('div.ZZ4OK.IwhtZ'))
            # The second match is the one used for each of these fields; a
            # review that lacks it yields '' instead of aborting the store.
            date = _nth_text(r.select('span.tzZTd>span.place_blind'), 1)
            revisit = _nth_text(r.select('div._7kR3e>span.tzZTd'), 1)

            list_sheet.append([s_brand, s_type, s_store, date, nickname, content, revisit])

    except Exception as e:
        # Best effort: report the problem and move on to the next store.
        print(e)

    # Save the file (complete on success, partial after an error).
    xlsx.save(os.path.join(OUTPUT_DIR, file_name))
    print(str(x) + ' ' + 'store_finish')
    

0 store_finish
1 store_finish


In [5]:
## Merge every workbook in the output folder into one file
#!pip install openpyxl
#!pip install xlrd

import glob
import os
import datetime as dt

path = 'output/'
# Sorted so the row order of the merged file is reproducible between runs.
files = sorted(glob.glob(path + "*.xlsx"))
if not files:
    # Fail loudly here instead of continuing with a stale or missing merged file.
    raise FileNotFoundError('no .xlsx files found in ' + path)

# DataFrame.append no longer works in pandas 2.x, so the frames are combined
# with a single pd.concat call over all per-store sheets.
excel = pd.concat(
    [pd.read_excel(file_name, sheet_name='output') for file_name in files],
    ignore_index=True,
)

### Convert dates such as "2023년 10월 4일 수요일" to timestamps (yyyy-mm-dd)
df = excel.copy()
df['date'] = (
    df['date']
    .astype('string')  # keeps .str usable even when every date cell is empty
    .str.replace(
        r'^\s*(\d{4})년\s*(\d{1,2})월\s*(\d{1,2})일.*$',
        r'\1-\2-\3',
        regex=True,
    )
)
# Values that did not match the pattern are left untouched by the replace
# above, so errors='raise' still reports unexpected date formats.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d', errors='raise')

# Write the merged file once, after the dates have been converted.
os.makedirs('concated_file', exist_ok=True)
df.to_excel('concated_file/naver_review_all.xlsx', index=False)