In [None]:
# accommodations_table (완료)
    # accommodations_id
    # accommodations_main_category
    # accommodations_sub_category
    # accommodations_name

# review_table (완료)
    # accommodations_id
    # accommodations_rating
    # accommodations_review_count


########## 미구현
### 아래처럼 총 2개의 A.csv, B.csv를 만들지?
##### date_type = H인 경우의 price_table_A,
##### date_type = W인 경우의 price_table_B

### 아래처럼 1개의 csv로 만들지?
##### accomm_id: a, date_type: H, price: 20000
##### accomm_id: a, date_type: W, price: 50000
##### accomm_id: b, date_type: H, price: 30000
##### accomm_id: b, date_type: W, price: 70000
# price_table
    # accommodations_id
    # date_type(H/W)
    # price

In [7]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import json
import re
import os

from config import region_dict, weekday_dates, holiday_dates, category_codes

In [8]:
# 셀레니움 웹드라이버 설정
def setup_driver():
    """Build and return a headless Chrome WebDriver.

    The chromedriver binary path is obtained from
    ``ChromeDriverManager().install()`` and handed to Selenium through a
    ``Service`` object.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')

    driver_path = ChromeDriverManager().install()
    chrome_service = Service(driver_path)
    return webdriver.Chrome(service=chrome_service, options=chrome_options)


# URL 생성
def generate_url(base_url, region, checkin, checkout, category, page):
    """Return the listing-search URL for one region, date range, category and page.

    ``base_url`` is used as-is as the prefix, so it must already end with the
    ``?`` that starts the query string. The values are interpolated verbatim
    (no percent-encoding is applied here).
    """
    search_url = f"{base_url}keyword={region}&autoKeyword=&checkIn={checkin}&checkOut={checkout}&personal=2&freeForm=true&category={category}&page={page}"
    return search_url


# URL의 HTML 소스 가져오기
def fetch_page_source(driver, url, wait_seconds=3):
    """Load ``url`` in ``driver`` and return the resulting page source.

    Args:
        driver: Object exposing ``get(url)`` and a ``page_source`` attribute
            (a Selenium WebDriver in this notebook).
        url: Address to load.
        wait_seconds: Fixed pause after navigation before the source is read,
            giving the page time to finish rendering. Defaults to 3, the
            previously hard-coded value; pass 0 (or None) to skip the pause.

    Returns:
        The value of ``driver.page_source`` after the pause.
    """
    driver.get(url)
    # Skip the sleep entirely for zero/None so callers can fetch without delay
    if wait_seconds and wait_seconds > 0:
        time.sleep(wait_seconds)
    return driver.page_source


def get_checkinout_dates(is_weekday=True):
    """Return the ``(check-in, check-out)`` pair for the requested stay type.

    The pair is the first two entries of ``weekday_dates`` when ``is_weekday``
    is truthy, otherwise the first two entries of ``holiday_dates``.
    """
    # Weekday stay vs. weekend stay
    selected_dates = weekday_dates if is_weekday else holiday_dates
    return selected_dates[0], selected_dates[1]

In [9]:
# HTML에서 숙소 데이터 파싱
def parse_main_page_data(html, category_key):
    """Extract accommodation and review records from a listing page's JSON-LD.

    The page's first ``<script type="application/ld+json">`` is decoded and
    its ``mainEntity.itemListElement`` entries are turned into two tables.

    Args:
        html: HTML source of a listing page.
        category_key: Main category label (e.g. "모텔") that must appear in an
            item's ``starRating`` text for the item to be kept.

    Returns:
        ``(accommodations_table, review_table)``: two dicts keyed by the
        accommodation id (the digits captured from the item URL, as a string).
        ``(None, None)`` is returned when the page has no JSON-LD script or
        the script is empty, so callers can always unpack two values.

    Raises:
        json.JSONDecodeError: if the script content is not valid JSON.
    """
    soup = BeautifulSoup(html, 'html.parser')
    script_tag = soup.find('script', type='application/ld+json')
    # Always hand back a 2-tuple: the caller unpacks the result into two names
    # and tests them against None.
    if not script_tag or not script_tag.string:
        return None, None

    json_data = json.loads(script_tag.string)
    # "or" guards cover keys that are present but null
    main_entity = json_data.get('mainEntity') or {}
    item_list = main_entity.get('itemListElement') or []

    accommodations_table = {}
    review_table = {}

    for item in item_list:
        main_page_info = item.get('item')
        if main_page_info is None:
            continue

        accomm_id_match = re.search(r'domestic-accommodations/(\d+)', main_page_info.get('url') or '')
        accomm_id = accomm_id_match.group(1) if accomm_id_match else None

        print('accomm id:', accomm_id)

        if accomm_id is None:
            # Without an id every such item would collide under the key None
            continue

        star_rating = main_page_info.get('starRating') or ""

        # Split starRating into the main category and the remaining sub-category text
        if not isinstance(star_rating, str) or category_key not in star_rating:
            # Main category missing: skip only this item, not the rest of the page
            print("No main_category data exists")
            continue

        main_category = category_key
        remainder = star_rating.replace(category_key, "")

        # Split on '·', drop blank pieces, and re-join with ', '
        # e.g. "블랙 · 4성급 · " => ["블랙", "4성급"] => "블랙, 4성급"
        sub_category = ', '.join(part.strip() for part in remainder.split('·') if part.strip())

        name = main_page_info.get('name', "")

        accommodations_table[accomm_id] = {
            "main_category": main_category,
            "sub_category": sub_category,
            "name": name
        }

        rating_info = main_page_info.get('aggregateRating') or {}
        rating_value = rating_info.get('ratingValue') or 0
        review_count = rating_info.get('reviewCount', 0)

        review_table[accomm_id] = {
            # Convert before doubling: multiplying a string rating would repeat
            # the text ("4.5" * 2 == "4.54.5") instead of doubling the number.
            # NOTE(review): the doubling presumably rescales a 5-point rating to 10 points - confirm.
            "rating": float(rating_value) * 2,
            "review_count": review_count
        }

    return accommodations_table, review_table


# 숙소 크롤링
def crawl_main_page(base_url, checkin, checkout, category, category_key, max_pages=2):
    """Crawl the listing pages of every region and merge the parsed tables.

    Args:
        base_url: URL prefix passed to ``generate_url``.
        checkin: Check-in date passed to ``generate_url``.
        checkout: Check-out date passed to ``generate_url``.
        category: Category code passed to ``generate_url``.
        category_key: Category label passed to ``parse_main_page_data``.
        max_pages: Highest page number fetched per region. Defaults to 2, the
            previously hard-coded limit; ``None`` keeps paging until a page
            has no JSON-LD data or yields no accommodations.

    Returns:
        ``(accommodations_table, review_table)``: dicts merged across all
        regions and pages; later pages overwrite earlier entries with the
        same key.
    """
    driver = setup_driver()

    accommodations_table = {}
    review_table = {}

    try:
        for region in region_dict.values():
            page = 1

            while max_pages is None or page <= max_pages:
                url = generate_url(base_url, region, checkin, checkout, category, page)
                html = fetch_page_source(driver, url)
                parsed = parse_main_page_data(html, category_key)

                # The parser may report "no JSON-LD" either as a bare empty
                # dict or as a pair of Nones; accept both without crashing
                # on the unpack.
                if not parsed:
                    accommodations, reviews = None, None
                else:
                    accommodations, reviews = parsed

                if accommodations is None or reviews is None:
                    print("No script 'application/ld+json' exist")
                    break
                if not accommodations:
                    # Empty page: nothing more to collect for this region
                    break

                accommodations_table.update(accommodations)
                review_table.update(reviews)
                page += 1
    finally:
        # Release the browser even if the crawl is interrupted or fails
        driver.quit()

    return accommodations_table, review_table

In [10]:
def create_dir_if_not_exists(directory):
    """Create ``directory`` (including missing parents) unless it already exists.

    ``exist_ok=True`` makes the call safe to repeat and avoids the window
    between a separate existence check and the creation.

    Raises:
        FileExistsError: if ``directory`` exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)


def create_csv(table, index_name, column_mappings, filename_prefix, output_dir='tables'):
    """Write a dict-of-dicts table to ``<output_dir>/<filename_prefix>.csv``.

    Args:
        table: Mapping of row key -> dict of column values.
        index_name: Column name given to the row keys in the CSV.
        column_mappings: ``{old_name: new_name}`` renames applied to the columns.
        filename_prefix: File name without the ``.csv`` extension.
        output_dir: Directory to write into; created if missing. Defaults to
            ``'tables'``, the previously hard-coded location.
    """
    # Build the frame in one chain instead of mutating it in place
    df = (
        pd.DataFrame.from_dict(table, orient='index')
        .rename_axis(index_name)
        .reset_index()
        .rename(columns=column_mappings)
    )

    create_dir_if_not_exists(output_dir)
    filename = os.path.join(output_dir, f'{filename_prefix}.csv')

    df.to_csv(filename, index=False)
    print(f"저장 완료: {filename}")


def save_accommodations_table_to_csv(accommodations_table):
    """Export the accommodations table through ``create_csv``.

    Row keys are written as ``accommodations_id`` and each field is renamed to
    its ``accommodations_``-prefixed column; the file prefix is
    ``accommodations_table``.
    """
    create_csv(
        accommodations_table,
        'accommodations_id',
        {
            'main_category': 'accommodations_main_category',
            'sub_category': 'accommodations_sub_category',
            'name': 'accommodations_name',
        },
        'accommodations_table',
    )


def save_review_table_to_csv(review_table):
    """Export the review table through ``create_csv``.

    Row keys are written as ``accommodations_id`` and each field is renamed to
    its ``accommodations_``-prefixed column; the file prefix is
    ``review_table``.
    """
    create_csv(
        review_table,
        'accommodations_id',
        {
            'rating': 'accommodations_rating',
            'review_count': 'accommodations_review_count',
        },
        'review_table',
    )


In [11]:
# Main code: crawl configuration
# Listing endpoint; generate_url appends the query string directly after the trailing "?"
base_url = "https://www.yeogi.com/domestic-accommodations?"
# Category label: used as the key into category_codes and matched against each item's starRating text
category_key = "모텔"

In [12]:
# Look up the category code that corresponds to the chosen category label
category = category_codes[category_key]
checkin, checkout = get_checkinout_dates(is_weekday=True)  # fetch weekday prices
# checkin, checkout = get_checkinout_dates(is_weekday=False)  # fetch weekend prices
# Crawl every region and collect both tables (launches a headless browser)
accommodations_table, review_table = crawl_main_page(base_url, checkin, checkout, category, category_key)

accomm id: 3468
accomm id: 2067
accomm id: 57436
accomm id: 4453
accomm id: 152
accomm id: 46728
accomm id: 66769
accomm id: 66224
accomm id: 56215
accomm id: 170
accomm id: 4691
accomm id: 66620
accomm id: 3164
accomm id: 5467
accomm id: 68413
accomm id: 5203
accomm id: 48986
accomm id: 56291
accomm id: 49868
accomm id: 5680
accomm id: 51941
accomm id: 68563
accomm id: 12546
accomm id: 11858
accomm id: 269
accomm id: 63410
accomm id: 4072
accomm id: 4485
accomm id: 5917
accomm id: 3237
accomm id: 3543
accomm id: 57286
accomm id: 5052
accomm id: 70288
accomm id: 2088
accomm id: 5288
accomm id: 1382
accomm id: 3163
accomm id: 62642
accomm id: 67990


KeyboardInterrupt: 

In [None]:
# Write both tables to CSV.
# NOTE(review): accommodations_table / review_table are assigned only when the crawl
# cell runs to completion; if it is interrupted, these names hold whatever an earlier
# run left in the kernel (or are undefined on a fresh kernel).
save_accommodations_table_to_csv(accommodations_table)
save_review_table_to_csv(review_table)

저장 완료: tables/accommodations_table.csv
저장 완료: tables/review_table.csv
