In [19]:
# location_table
    # accommodations_id
    # Location_major
    # Location_middle
    # Location_sub

# motel_facilities_table
    # accommodations_id
    # accommodations_main_category
    # there are many facilities; each one is stored as its own boolean column
    # e.g. exist_WIFI = False

In [38]:
# Both accommodations_id and accommodations_main_category can be taken from accommodations_table.csv

import pandas as pd
import os
import re
import time
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pyarrow

from config import motel_facilities_dict

In [39]:
# Set up the Selenium WebDriver
def setup_driver():
    """Create a headless Chrome WebDriver.

    The chromedriver path is obtained from ``ChromeDriverManager().install()``
    and handed to Selenium through a ``Service`` object.

    Returns:
        The newly created ``webdriver.Chrome`` instance.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')

    driver_path = ChromeDriverManager().install()
    chrome_service = Service(driver_path)
    return webdriver.Chrome(service=chrome_service, options=chrome_options)


# Build the detail-page URL
def generate_url(base_url, accomm_id):
    """Return ``base_url`` immediately followed by ``accomm_id``.

    No separator is inserted, so ``base_url`` must already end with whatever
    delimiter the target URL needs.
    """
    return "{}{}".format(base_url, accomm_id)


# Fetch the HTML source of a URL
def fetch_page_source(driver, url, wait_seconds=3):
    """Load ``url`` in ``driver`` and return the resulting page source.

    Args:
        driver: Object exposing ``get(url)`` and a ``page_source`` attribute
            (a Selenium WebDriver in this notebook).
        url: Address of the page to load.
        wait_seconds: Fixed pause after ``driver.get`` before the source is
            read. Defaults to 3, the value that used to be hard-coded.
            ``0`` or ``None`` skips the pause.

    Returns:
        The value of ``driver.page_source`` after the pause.
    """
    driver.get(url)
    # Fixed delay so the page can finish loading before the source is read.
    if wait_seconds and wait_seconds > 0:
        time.sleep(wait_seconds)
    return driver.page_source

In [40]:
def parse_detail_page_data(html, accomm_id, index_num, accomm_main_categories):
    """Extract the location row and the facility row for one detail page.

    The data is read from the first ``<script type="application/ld+json">``
    tag in ``html``: the street address under
    ``mainEntity.address.streetAddress`` and the amenity names under
    ``mainEntity.amenityFeature[*].name``.

    Args:
        html: HTML source of the accommodation detail page.
        accomm_id: Accommodation ID; used as the key of both returned dicts.
        index_num: Position of this accommodation in ``accomm_main_categories``.
        accomm_main_categories: Sequence of main categories, indexed by
            ``index_num``.

    Returns:
        ``(location_table, motel_facilities_table)``, each a one-entry dict
        keyed by ``accomm_id``:

        * the location row holds ``location_major`` (first whitespace-separated
          token of the street address), ``location_middle`` (second token) and
          ``location_sub`` (remaining tokens joined by a space). Tokens that
          are missing are stored as ``''``.
        * the facility row holds ``accomm_main_category`` followed by one entry
          per key of ``motel_facilities_dict``, set to ``True`` when the page
          lists that amenity and left at the dict's own value otherwise
          (presumably ``False`` -- confirm in ``config``).

        ``({}, {})`` is returned when the page has no usable JSON-LD payload
        (no script tag, empty tag, undecodable JSON, or a non-object payload).

    Raises:
        IndexError: if ``index_num`` is out of range for
            ``accomm_main_categories``.
    """
    main_category = accomm_main_categories[index_num]

    soup = BeautifulSoup(html, 'html.parser')
    script_tag = soup.find('script', type='application/ld+json')
    if script_tag is None:
        print(f"No script tag found for accommodation ID: {accomm_id}")
        return {}, {}

    # ``.string`` can be None, which would make the ``endswith`` check below
    # raise AttributeError.
    json_string = script_tag.string
    if not json_string:
        print(f"Empty script tag for accommodation ID: {accomm_id}")
        return {}, {}

    # Some payloads are cut off right after an escaped '>' inside a string
    # value; close the open string and the enclosing containers so that the
    # payload can still be decoded.
    if json_string.endswith(('&gt;-', '&gt;')):
        json_string += '"}]}}'

    try:
        json_data = json.loads(json_string)
    except json.JSONDecodeError as e:
        print(f"JSON decoding error for accommodation ID {accomm_id}: {e}")
        print("except Script Tag Content:", json_string)
        return {}, {}

    if not isinstance(json_data, dict):
        print(f"Unexpected JSON-LD structure for accommodation ID {accomm_id}: "
              f"{type(json_data).__name__}")
        return {}, {}

    main_entity = json_data.get('mainEntity')
    if not isinstance(main_entity, dict):
        main_entity = {}

    # --- location -----------------------------------------------------------
    address_info = main_entity.get('address')
    street = address_info.get('streetAddress') if isinstance(address_info, dict) else None
    parts = street.split() if isinstance(street, str) else []

    # A missing or short address must not abort the whole crawl with an
    # IndexError, so absent components are stored as empty strings.
    location_table = {
        accomm_id: {
            "location_major": parts[0] if len(parts) > 0 else '',
            "location_middle": parts[1] if len(parts) > 1 else '',
            "location_sub": ' '.join(parts[2:]),
        }
    }

    # --- facilities ---------------------------------------------------------
    facility_info = main_entity.get('amenityFeature')
    if isinstance(facility_info, dict):
        # Tolerate a single amenity object that is not wrapped in a list.
        facility_info = [facility_info]
    elif not isinstance(facility_info, list):
        facility_info = []

    # Entries without a string ``name`` are ignored instead of raising.
    offered = {
        item['name']
        for item in facility_info
        if isinstance(item, dict) and isinstance(item.get('name'), str)
    }

    # Work on a copy so the shared template dict is never mutated.
    facilities_status = motel_facilities_dict.copy()
    for facility in offered:
        if facility in facilities_status:
            facilities_status[facility] = True

    motel_facilities_table = {
        accomm_id: {'accomm_main_category': main_category, **facilities_status}
    }

    return location_table, motel_facilities_table


def crawl_detail_page(base_url, accomm_ids, accomm_main_categories):
    """Crawl every accommodation detail page and collect both tables.

    Args:
        base_url: URL prefix passed to ``generate_url`` together with each ID.
        accomm_ids: Iterable of accommodation IDs to crawl, in order.
        accomm_main_categories: Sequence parallel to ``accomm_ids``; the entry
            at position ``i`` belongs to the ``i``-th ID.

    Returns:
        ``(location_table, motel_facilities_table)``: two dicts keyed by
        accommodation ID, merged from the per-page results of
        ``parse_detail_page_data``. Pages for which either result is empty are
        skipped. If an ID occurs more than once, the last occurrence wins.

    The WebDriver created by ``setup_driver`` is always closed via ``quit()``,
    even when a page raises.
    """
    driver = setup_driver()
    location_table = {}
    motel_facilities_table = {}

    try:
        # enumerate() yields the true position of each ID. The former
        # ``accomm_ids.index(accomm_id)`` lookup was O(n) per page and returned
        # the first position for duplicated IDs, pairing them with the wrong
        # category.
        for index_num, accomm_id in enumerate(accomm_ids):
            url = generate_url(base_url, accomm_id)
            html = fetch_page_source(driver, url)
            locations, motel_facilities = parse_detail_page_data(
                html, accomm_id, index_num, accomm_main_categories
            )

            if not locations:
                print('No data for locations exist')
            if not motel_facilities:
                print('No data for motel_facilities_table exist')
            if not locations or not motel_facilities:
                # Keep both tables aligned: store a page only when it produced
                # a row for each of them.
                continue

            location_table.update(locations)
            motel_facilities_table.update(motel_facilities)
    finally:
        driver.quit()

    return location_table, motel_facilities_table

In [41]:
def create_dir_if_not_exists(directory):
    """Create ``directory`` (and any missing parents) unless it already exists.

    ``exist_ok=True`` makes the call safe when the directory appears between a
    check and the creation, which the previous exists-then-create sequence was
    not.

    Raises:
        FileExistsError: if ``directory`` exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)


def create_parquet(table, index_name, column_mappings, filename_prefix, output_dir='tables'):
    """Write a dict-of-dicts table to ``<output_dir>/<filename_prefix>.parquet``.

    Args:
        table: Mapping of row key -> mapping of column name -> value.
        index_name: Name of the column that receives the row keys.
        column_mappings: Mapping of original column name -> output column name,
            applied after the row keys have become a regular column.
        filename_prefix: File name without the ``.parquet`` extension.
        output_dir: Directory to write into; it is created when missing.
            Defaults to ``'tables'``, the location that used to be hard-coded.

    The file is written with the ``pyarrow`` engine and without the DataFrame
    index; a confirmation line with the file path is printed afterwards.
    """
    df = (
        pd.DataFrame.from_dict(table, orient='index')
        .rename_axis(index_name)  # name the key index ...
        .reset_index()            # ... and turn it into an ordinary column
        .rename(columns=column_mappings)
    )

    create_dir_if_not_exists(output_dir)
    filename = os.path.join(output_dir, f'{filename_prefix}.parquet')

    df.to_parquet(filename, engine='pyarrow', index=False)
    print(f"저장 완료: {filename}")


def save_facilities_table_to_parquet(motel_facilities_table):
    """Save the facilities table as ``motel_facilities_table`` via ``create_parquet``.

    The row keys become the ``accommodations_id`` column, the category column
    is renamed to ``accommodations_main_category`` and every Korean facility
    name is renamed to its ``exist_*`` output column.
    """
    # Korean facility name (as stored in the table) -> output column name.
    facility_columns = dict([
        ('트윈베드', 'exist_twinbed'),
        ('파티룸', 'exist_partyroom'),
        ('스파/월풀', 'exist_spa/whirlpool'),
        ('수영장', 'exist_swimming_pool'),
        ('공주룸', 'exist_princess_room'),
        ('노천탕', 'exist_open_air_bath'),
        ('거울룸', 'exist_mirror_room'),
        ('히노끼탕', 'exist_hinoki_bath'),
        ('맛사지 베드', 'exist_massage_bed'),
        ('반신욕', 'exist_half_bath'),
        ('욕실 TV', 'exist_bathroom_tv'),
        ('호수뷰', 'exist_lake_view'),
        ('복층룸', 'exist_duplex_room'),
        ('바다뷰', 'exist_sea_view'),
        ('하늘뷰', 'exist_sky_view'),
        ('야외테라스', 'exist_outdoor_terrace'),
        ('빔프로젝터', 'exist_beam_projector'),
        ('사우나/찜질방', 'exist_sauna'),
        ('3D TV', 'exist_3d_tv'),
        ('당구대', 'exist_billiard_table'),
        ('미니바', 'exist_minibar'),
        ('게임기', 'exist_game_console'),
    ])
    renames = {'accomm_main_category': 'accommodations_main_category', **facility_columns}

    create_parquet(
        table=motel_facilities_table,
        index_name='accommodations_id',
        column_mappings=renames,
        filename_prefix='motel_facilities_table',
    )


def save_location_table_to_parquet(location_table):
    """Save the location table as ``location_table`` via ``create_parquet``.

    The row keys become the ``accommodations_id`` column; the three location
    columns keep their names (the mapping is an identity mapping).
    """
    location_columns = ('location_major', 'location_middle', 'location_sub')
    identity_mapping = {column: column for column in location_columns}

    create_parquet(
        table=location_table,
        index_name='accommodations_id',
        column_mappings=identity_mapping,
        filename_prefix='location_table',
    )

In [42]:
# URL prefix of an accommodation detail page; the accommodation ID is appended to it.
base_url = "https://www.yeogi.com/domestic-accommodations/"

# Read only the two columns the crawl needs from the accommodations table.
# (Earlier CSV-based variant kept for reference.)
# df = pd.read_csv('tables/accommodations_table.csv', usecols=['accommodations_id', 'accommodations_main_category'])
df = pd.read_parquet('tables/accommodations_table.parquet', columns=['accommodations_id', 'accommodations_main_category'])

# Parallel lists in row order: accomm_main_categories[i] is the category of accomm_ids[i].
accomm_ids = df['accommodations_id'].tolist()
accomm_main_categories = df['accommodations_main_category'].tolist()

# Small hand-picked ID list for a quick trial run (disabled).
# accomm_ids = [81322, 66425, 56409]

# Visit every detail page and collect both tables as dicts keyed by accommodation ID.
location_table, motel_facilities_table = crawl_detail_page(base_url, accomm_ids, accomm_main_categories)

In [43]:
# Write both crawled tables to Parquet files; each call prints the path it wrote.
save_facilities_table_to_parquet(motel_facilities_table)
save_location_table_to_parquet(location_table)

저장 완료: tables/motel_facilities_table.parquet
저장 완료: tables/location_table.parquet
