# Library

In [11]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
import time

import requests
import datetime

import pandas as pd

# Parameter

In [12]:
list_base_url = "https://www.yeogi.com/domestic-accommodations?"
region_list = ["경기", "제주도", "충남", "인천", "대구", "대전", "서울", "경남", "부산", "전북", "울산", "광주", "강원", "경북", "전남", "충북", "세종"]
Category = 2

In [13]:
def weekday_setting_url(base_url, Category, Keyword):
    # cron tab : 0 => 월요일
    monday_date = "2024-11-11" # cron 시 : datetime.datetime.now().strftime("%Y-%m-%d")
    friday_date = "2024-11-15" # cron 시 : (datetime.datetime.now() + datetime.timedelta(days=4)).strftime("%Y-%m-%d")
    
    weekday_parameter = f"keyword={Keyword}&autoKeyword=&checkIn={monday_date}&checkOut={friday_date}&personal=2&freeForm=false&category={Category}&" #page={page}
    
    url = base_url+weekday_parameter
    return url

def holiday_setting_url(base_url, Category, Keyword):
    saturday_date = "2024-11-16" # cron 시 : (datetime.datetime.now() + datetime.timedelta(days=5)).strftime("%Y-%m-%d")
    sunday_date = "2024-11-17" # cron 시 : (datetime.datetime.now() + datetime.timedelta(days=6)).strftime("%Y-%m-%d")
    
    holiday_parameter = f"keyword={Keyword}&autoKeyword=&checkIn={saturday_date}&checkOut={sunday_date}&personal=2&freeForm=false&category={Category}&" #page={page}
    
    url = base_url+holiday_parameter
    return url

#####################
# Setting Dataframe #
#####################
def Make_Basic_DataFrame():
    # Dataframe 셋팅
    accommodations_frame = {
        'accommodation_id': [],
        'accommodation_MainCategory': [],
        'accomdation_SubCtegory' : [],
        'accomodations_Name' : []
    }
    accommodations_df = pd.DataFrame(accommodations_frame)
        
    price_frame = {
        'accommodation_id': [],
        'Date_Type' : [],
        'price': []
    }
    price_df = pd.DataFrame(price_frame)
        
    review_frame = {
        'accommodation_id': [],
        'accommodations_Rating': [],
        'accomodations_Review' : []
    }
    review_df = pd.DataFrame(review_frame)
    return accommodations_df, price_df, review_df

####################
# Crawling Setting #
####################
def crawling_setting():
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode (without GUI)
    chrome_options.add_argument("--no-sandbox")  # Bypass OS security model
    chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems

    # Initialize the Chrome driver
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)
    return driver

##################
# Crawling Basic #
##################
def extract_accommodation_price(element):  
    """Extract price from the accommodation element."""
    try:
        price_element = element.find_element(By.CLASS_NAME, "css-5r5920")
        price_text = price_element.text.strip().replace(',', '')  # Remove commas
        return int(price_text)  # Convert to integer
    except NoSuchElementException:
        print("Price element not found, skipping this element.")
        return None  # Return None if not found
    except Exception as e:
        print(f"An error occurred while extracting the price: {e}")
        return None  # Return None if any other error occurs

def extract_accommodation_data(element):
    """Extract various details from an accommodation element."""
    try:
        # Extract Accommodation ID
        href = element.get_attribute("href")
        Accommodation_ID = href.split('/')[-1].split('?')[0]

        # Extract Name
        Accommodation_Name = element.find_element(By.CLASS_NAME, "gc-thumbnail-type-seller-card-title").text.strip()

        # Extract Rating and Review
        review_rating = element.find_element(By.CLASS_NAME, "css-9ml4lz").text.strip()  # Rating
        review_count_text = element.find_element(By.CLASS_NAME, "css-oj6onp").text.strip()  # Review Count
        review_count_numeric = ''.join(filter(str.isdigit, review_count_text))  # Extract only digits
        review_count = int(review_count_numeric) if review_count_numeric else 0  # Default to 0 if empty

        # Extract Accommodation Subcategory
        ul_element = element.find_element(By.CLASS_NAME, "css-19akvy6")
        li_elements = ul_element.find_elements(By.TAG_NAME, "li")
        Accomodation_SubCategory = "_".join([li.text for li in li_elements])

        return Accommodation_ID, Accommodation_Name, review_rating, review_count, Accomodation_SubCategory
    
    except Exception as e:
        print(f"An error occurred while extracting data: {e}")
        return None  # Return None if any error occurs
    
def update_dataframes(accommodations_df, price_df, review_df, temp_accommodations, temp_prices, temp_reviews):
    """Update DataFrames with new data."""
    accommodations_df = pd.concat([accommodations_df, pd.DataFrame(temp_accommodations)], ignore_index=True)
    price_df = pd.concat([price_df, pd.DataFrame(temp_prices)], ignore_index=True)
    review_df = pd.concat([review_df, pd.DataFrame(temp_reviews)], ignore_index=True)
    return accommodations_df, price_df, review_df

def scrape_accommodation(driver, base_url, date_type, Category, Keyword_list):
    accommodation_df, price_df, review_df = Make_Basic_DataFrame()
    
    for Keyword in Keyword_list:
        print("Crawling Start KeyWord : ", Keyword )
        page = 1
        
        if date_type == "Weekday":
            Keyword_base_url = weekday_setting_url(base_url, Category, Keyword)
        else:
            Keyword_base_url = holiday_setting_url(base_url, Category, Keyword)   
        
        print("Crawling_baseurl : ", base_url)
        
        while True:
            url = Keyword_base_url+f"page={page}"
            print("crawling_url : ", url)
            driver.get(url)

            elements = driver.find_elements(By.CLASS_NAME, "gc-thumbnail-type-seller-card")
            
            if not elements:
                print("No more elements found. Exiting...")
                break
            
            temp_accommodations = []
            temp_prices = []
            temp_reviews = []
            
            for element in elements:
                # Extract price
                price = extract_accommodation_price(element)
                if price is None:
                    continue  # Skip this iteration if price is not found
                
                # Extract accommodation data
                extracted_data = extract_accommodation_data(element)
                if extracted_data is None:
                    continue  # Skip if there was an error extracting data
                
                Accommodation_ID, Accommodation_Name, review_rating, review_count, Accomodation_SubCategory = extracted_data
            
                temp_accommodations.append({
                        'accommodation_id': Accommodation_ID,
                        'accommodation_MainCategory': "Hotel_Resort",
                        'accomdation_SubCtegory': Accomodation_SubCategory,
                        'accomodations_Name': Accommodation_Name
                })

                temp_prices.append({
                    'accommodation_id': Accommodation_ID,
                    'Date_Type': "Holiday",  # Use the provided date type
                    'price': price
                })

                temp_reviews.append({
                    'accommodation_id': Accommodation_ID,
                    'accommodations_Rating': review_rating,
                    'accomodations_Review': review_count
                })
                
            accommodation_df, price_df, review_df = update_dataframes(accommodation_df, price_df, review_df, temp_accommodations, temp_prices, temp_reviews)
            page += 1
            
    return accommodation_df, price_df, review_df

In [14]:
driver = crawling_setting()

accommodation_df, price_df, review_df  = scrape_accommodation(driver, list_base_url, "Weekday", Category, region_list)

Crawling Start KeyWord :  경기
Crawling_baseurl :  https://www.yeogi.com/domestic-accommodations?
crawling_url :  https://www.yeogi.com/domestic-accommodations?keyword=경기&autoKeyword=&checkIn=2024-11-11&checkOut=2024-11-15&personal=2&freeForm=false&category=2&page=1
crawling_url :  https://www.yeogi.com/domestic-accommodations?keyword=경기&autoKeyword=&checkIn=2024-11-11&checkOut=2024-11-15&personal=2&freeForm=false&category=2&page=2
crawling_url :  https://www.yeogi.com/domestic-accommodations?keyword=경기&autoKeyword=&checkIn=2024-11-11&checkOut=2024-11-15&personal=2&freeForm=false&category=2&page=3
crawling_url :  https://www.yeogi.com/domestic-accommodations?keyword=경기&autoKeyword=&checkIn=2024-11-11&checkOut=2024-11-15&personal=2&freeForm=false&category=2&page=4
crawling_url :  https://www.yeogi.com/domestic-accommodations?keyword=경기&autoKeyword=&checkIn=2024-11-11&checkOut=2024-11-15&personal=2&freeForm=false&category=2&page=5
crawling_url :  https://www.yeogi.com/domestic-accommodatio

In [8]:
accommodation_df

Unnamed: 0,accommodation_id,accommodation_MainCategory,accomdation_SubCtegory,accomodations_Name
0,69476,Hotel_Resort,블랙_4성급_호텔,테이크 호텔 서울광명
1,75023,Hotel_Resort,레지던스_호텔,르컬렉티브 시흥 웨이브파크
2,67765,Hotel_Resort,4성급_호텔,나인트리 바이 파르나스 서울 판교
3,6877,Hotel_Resort,3성급_호텔,[반짝특가] 신라스테이 동탄
4,58125,Hotel_Resort,3성급_호텔,라마다 앙코르 바이 윈덤 김포 한강
...,...,...,...,...
1726,67758,Hotel_Resort,4성급_호텔,베스트웨스턴 플러스 호텔 세종
1727,62500,Hotel_Resort,비즈니스_호텔,오송 H호텔 세종시티
1728,6673,Hotel_Resort,3성급_호텔,세종호텔
1729,45159,Hotel_Resort,3성급_호텔,춘천 세종호텔


In [9]:
price_df

Unnamed: 0,accommodation_id,Date_Type,price
0,69476,Holiday,135000.0
1,75023,Holiday,66624.0
2,67765,Holiday,157650.0
3,6877,Holiday,126660.0
4,58125,Holiday,118342.0
...,...,...,...
1726,67758,Holiday,160000.0
1727,62500,Holiday,150000.0
1728,6673,Holiday,172500.0
1729,45159,Holiday,72800.0


In [10]:
review_df

Unnamed: 0,accommodation_id,accommodations_Rating,accomodations_Review
0,69476,9.3,1297.0
1,75023,9.3,702.0
2,67765,9.4,934.0
3,6877,9.5,2926.0
4,58125,9.0,1580.0
...,...,...,...
1726,67758,9.4,263.0
1727,62500,9.3,275.0
1728,6673,9.0,766.0
1729,45159,8.4,221.0
