# Library

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
import time

import requests
import datetime

import os

import pandas as pd

# Parameter

In [2]:
# Base URL of the accommodation list page. It ends with "?" so that the URL
# builder functions can append the query string directly.
list_base_url = "https://www.yeogi.com/domestic-accommodations?"
# Region names (in Korean) that are passed to scrape_accommodation as the list
# of search keywords.
region_list = ["경기", "제주도", "충남", "인천", "대구", "대전", "서울", "경남", "부산", "전북", "울산", "광주", "강원", "경북", "전남", "충북", "세종"]
# Maps the numeric `category` query value to the label written to the
# ACCOMMODATION_MAINCATEGORY column by scrape_accommodation.
Category_Mapping_table = {
    1 : "Motel",
    2 : "Hotel/Resort",
    5 : "Camping"
}

# DEF : Extract and Transformation

In [3]:
def weekday_setting_url(base_url, Category, Keyword, check_in="2024-11-18", check_out="2024-11-22"):
    """Build the list-page URL prefix for a weekday (Monday to Friday) stay.

    Args:
        base_url: List-page URL that already ends with "?".
        Category: Value placed in the `category` query parameter.
        Keyword: Search keyword placed in the `keyword` query parameter.
        check_in: Check-in date as "YYYY-MM-DD". Defaults to the Monday that
            was previously hard-coded.
        check_out: Check-out date as "YYYY-MM-DD". Defaults to the Friday
            that was previously hard-coded.

    Returns:
        `base_url` followed by the query string. The result ends with "&" so
        the caller can append "page=<n>".
    """
    # For a scheduled (cron) run on a Monday, pass instead:
    #   check_in  = datetime.datetime.now().strftime("%Y-%m-%d")
    #   check_out = (datetime.datetime.now() + datetime.timedelta(days=4)).strftime("%Y-%m-%d")
    weekday_parameter = (
        f"keyword={Keyword}&autoKeyword=&checkIn={check_in}&checkOut={check_out}"
        f"&personal=2&freeForm=false&category={Category}&"
    )
    return base_url + weekday_parameter

def holiday_setting_url(base_url, Category, Keyword, check_in="2024-11-23", check_out="2024-11-24"):
    """Build the list-page URL prefix for a weekend (Saturday to Sunday) stay.

    Args:
        base_url: List-page URL that already ends with "?".
        Category: Value placed in the `category` query parameter.
        Keyword: Search keyword placed in the `keyword` query parameter.
        check_in: Check-in date as "YYYY-MM-DD". Defaults to the Saturday
            that was previously hard-coded.
        check_out: Check-out date as "YYYY-MM-DD". Defaults to the Sunday
            that was previously hard-coded.

    Returns:
        `base_url` followed by the query string. The result ends with "&" so
        the caller can append "page=<n>".
    """
    # For a scheduled (cron) run on a Monday, pass instead:
    #   check_in  = (datetime.datetime.now() + datetime.timedelta(days=5)).strftime("%Y-%m-%d")
    #   check_out = (datetime.datetime.now() + datetime.timedelta(days=6)).strftime("%Y-%m-%d")
    holiday_parameter = (
        f"keyword={Keyword}&autoKeyword=&checkIn={check_in}&checkOut={check_out}"
        f"&personal=2&freeForm=false&category={Category}&"
    )
    return base_url + holiday_parameter

#####################
# Setting Dataframe #
#####################
def Make_Basic_DataFrame():
    """Create the three empty tables filled by the list-page crawl.

    Returns:
        Tuple `(accommodations_df, price_df, review_df)`; each is a DataFrame
        with its column names set and no rows.
    """
    def _empty_table(*column_names):
        # Built from a dict of empty lists, one entry per column.
        return pd.DataFrame({column: [] for column in column_names})

    accommodations_df = _empty_table(
        'ACCOMMODATION_ID',
        'ACCOMMODATION_MAINCATEGORY',
        'ACCOMMODATION_SUBCATEGORY',
        'ACCOMMODATION_NAME',
    )
    price_df = _empty_table(
        'ACCOMMODATION_ID',
        'DATE_TYPE',
        'PRICE',
    )
    review_df = _empty_table(
        'ACCOMMODATION_ID',
        'ACCOMMODATION_RATING',
        'ACCOMMODATION_REVIEWCOUNT',
    )
    return accommodations_df, price_df, review_df

####################
# Crawling Setting #
####################
def crawling_setting():
    """Start a headless Chrome WebDriver session and return the driver."""
    browser_options = Options()
    for flag in (
        "--headless",               # run without a GUI
        "--no-sandbox",             # bypass the OS security model
        "--disable-dev-shm-usage",  # overcome limited resource problems
    ):
        browser_options.add_argument(flag)

    # The chromedriver path comes from ChromeDriverManager().install().
    driver_path = ChromeDriverManager().install()
    chrome_service = ChromeService(driver_path)
    return webdriver.Chrome(service=chrome_service, options=browser_options)

##################
# Crawling Basic #
##################
def extract_accommodation_price(element):
    """Read the price shown on one accommodation card.

    Args:
        element: WebElement of a single result card.

    Returns:
        The price as an int (thousands separators removed), or None when the
        price element is missing or its text cannot be converted.
    """
    price = None
    try:
        raw_text = element.find_element(By.CLASS_NAME, "css-5r5920").text
        # Drop thousands separators before converting.
        price = int(raw_text.strip().replace(',', ''))
    except NoSuchElementException:
        print("Price element not found, skipping this element.")
    except Exception as e:
        print(f"An error occurred while extracting the price: {e}")
    return price

def extract_accommodation_data(element):
    """Pull ID, name, rating, review count and subcategory from a result card.

    Args:
        element: WebElement of a single result card; its `href` attribute is
            read to obtain the accommodation ID.

    Returns:
        Tuple `(id, name, rating, review_count, subcategory)`, or None if any
        step raises. `id` and `rating` are strings, `review_count` is an int
        (0 when the review text contains no digits), and `subcategory` is the
        `<li>` texts joined with "_".
    """
    try:
        # ID: last path segment of the link, without the query string.
        accommodation_id = element.get_attribute("href").split('/')[-1].split('?')[0]

        name = element.find_element(By.CLASS_NAME, "gc-thumbnail-type-seller-card-title").text.strip()

        rating = element.find_element(By.CLASS_NAME, "css-9ml4lz").text.strip()
        count_text = element.find_element(By.CLASS_NAME, "css-oj6onp").text.strip()
        # Keep only the digits of the review-count label.
        digits = ''.join(ch for ch in count_text if ch.isdigit())
        review_count = int(digits) if digits else 0

        tag_list = element.find_element(By.CLASS_NAME, "css-19akvy6")
        sub_category = "_".join(item.text for item in tag_list.find_elements(By.TAG_NAME, "li"))
    except Exception as e:
        print(f"An error occurred while extracting data: {e}")
        return None

    return accommodation_id, name, rating, review_count, sub_category
    
def update_dataframes(accommodations_df, price_df, review_df, temp_accommodations, temp_prices, temp_reviews):
    """Append one page of collected rows to the three result tables.

    Args:
        accommodations_df, price_df, review_df: Tables accumulated so far.
        temp_accommodations, temp_prices, temp_reviews: Lists of row dicts
            collected from the current page.

    Returns:
        New `(accommodations_df, price_df, review_df)` with the rows appended
        and the index renumbered; the input frames are not modified.
    """
    def _with_rows(table, rows):
        return pd.concat([table, pd.DataFrame(rows)], ignore_index=True)

    return (
        _with_rows(accommodations_df, temp_accommodations),
        _with_rows(price_df, temp_prices),
        _with_rows(review_df, temp_reviews),
    )

def scrape_accommodation(driver, base_url, date_type, Category, Keyword_list):
    """Crawl the result pages of every keyword and collect list-page data.

    For each keyword, pages are requested starting at page 1 until a page
    yields no result cards. Cards whose price or details cannot be extracted
    are skipped.

    Args:
        driver: Selenium WebDriver used to load the pages.
        base_url: List-page URL ending with "?".
        date_type: "Weekday" or "Holiday"; selects the URL builder and is
            stored in the DATE_TYPE column.
        Category: Category code for the query string; also looked up in
            Category_Mapping_table for the main-category label.
        Keyword_list: Iterable of search keywords.

    Returns:
        `(accommodation_df, price_df, review_df)`, or `(None, None, None)`
        when `date_type` is not recognised (checked while processing a
        keyword).
    """
    accommodation_df, price_df, review_df = Make_Basic_DataFrame()

    for Keyword in Keyword_list:
        print("Crawling Start KeyWord : ", Keyword )

        if date_type == "Weekday":
            build_url = weekday_setting_url
        elif date_type == "Holiday":
            build_url = holiday_setting_url
        else:
            print("Date_Type Error")
            return None, None, None
        url_prefix = build_url(base_url, Category, Keyword)

        print("Crawling_baseurl : ", base_url)

        page = 0
        while True:
            page += 1
            url = f"{url_prefix}page={page}"
            print("crawling_url : ", url)
            driver.get(url)

            cards = driver.find_elements(By.CLASS_NAME, "gc-thumbnail-type-seller-card")
            if not cards:
                # An empty page ends the pagination for this keyword.
                print("No more elements found. Exiting...")
                break

            accommodation_rows, price_rows, review_rows = [], [], []

            for card in cards:
                price = extract_accommodation_price(card)
                # Details are only read when a price was found.
                details = extract_accommodation_data(card) if price is not None else None
                if details is None:
                    continue

                acc_id, acc_name, rating, review_count, sub_category = details

                accommodation_rows.append({
                    'ACCOMMODATION_ID': acc_id,
                    'ACCOMMODATION_MAINCATEGORY': Category_Mapping_table.get(Category),
                    'ACCOMMODATION_SUBCATEGORY': sub_category,
                    'ACCOMMODATION_NAME': acc_name,
                })
                price_rows.append({
                    'ACCOMMODATION_ID': acc_id,
                    'DATE_TYPE': date_type,
                    'PRICE': price,
                })
                review_rows.append({
                    'ACCOMMODATION_ID': acc_id,
                    'ACCOMMODATION_RATING': rating,
                    'ACCOMMODATION_REVIEWCOUNT': review_count,
                })

            accommodation_df, price_df, review_df = update_dataframes(
                accommodation_df, price_df, review_df,
                accommodation_rows, price_rows, review_rows,
            )

    return accommodation_df, price_df, review_df

# DEF : LOAD TO PARQUET

In [None]:
# TODO: replace local file storage with S3 uploads via boto3
def create_dir_if_not_exists():
    """Ensure today's output directory exists and return its path.

    The directory is "./<YYYY-MM-DD>_Tables", relative to the current working
    directory, using the local date at call time.

    Returns:
        The directory path as a string.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    today = datetime.datetime.now().strftime("%Y-%m-%d")
    directory = "./" + today + "_Tables"
    # exist_ok=True replaces the separate exists() check, so there is no
    # window between checking and creating in which the call could fail.
    os.makedirs(directory, exist_ok=True)
    return directory

# Parquet file save function (list-page tables)
def create_parquet_Basic(directory, accommodation_df, review_df, price_df):
    """Write the accommodation, review and price tables as Parquet files.

    Files are written into `directory` with the pyarrow engine and without
    the index, in the order accommodation, review, price. A confirmation
    line is printed after each file.

    Args:
        directory: Existing target directory.
        accommodation_df: Saved as 'accommodation_table.parquet'.
        review_df: Saved as 'review_table.parquet'.
        price_df: Saved as 'price_table.parquet'.
    """
    outputs = (
        ('accommodation_table.parquet', accommodation_df),
        ('review_table.parquet', review_df),
        ('price_table.parquet', price_df),
    )
    for file_name, table in outputs:
        target_path = os.path.join(directory, file_name)
        table.to_parquet(target_path, engine='pyarrow', index=False)
        print(f"저장 완료: {target_path}")


# SCRAPING

In [9]:
# Start the Chrome session used by both list crawls below.
driver = crawling_setting()

# NOTE(review): `Category` is not assigned in any cell visible here; the loop
# that presumably bound it is commented out on the next line. A fresh run of
# this cell would raise NameError unless `Category` is defined first.
# TODO confirm the intended category code(s).
# for Category in Category_List:
accommodation_Weekday_df, price_Weekday_df, review_Weekday_df  = scrape_accommodation(driver, list_base_url, "Weekday", Category, region_list)
accommodation_Holiday_df, price_Holiday_df, review_Holiday_df  = scrape_accommodation(driver, list_base_url, "Holiday", Category, region_list)
# Weekday + holiday: merge the basic info and drop duplicate IDs.
accommodation_df = pd.concat([accommodation_Weekday_df, accommodation_Holiday_df], ignore_index=True)
accommodation_df = accommodation_df.drop_duplicates(['ACCOMMODATION_ID'])
# The ID is parsed from the card's href as a string, so cast it to int64.
accommodation_df = accommodation_df.astype({'ACCOMMODATION_ID': 'int64'})
# total_accommodation = pd.concat([total_accmmodation_df, accommodation_df], ignore_index=True)

# Weekday + holiday: merge the review data and drop duplicate IDs.
# drop_duplicates keeps the first row per ID, and the weekday rows come first.
review_df = pd.concat([review_Weekday_df, review_Holiday_df], ignore_index=True)
review_df = review_df.drop_duplicates(['ACCOMMODATION_ID'])
review_df = review_df.astype({'ACCOMMODATION_ID': 'int64', 'ACCOMMODATION_REVIEWCOUNT': 'int64'})
# total_review = pd.concat([total_review_df, review_df], ignore_index=True)

# Weekday + holiday: prices are not de-duplicated, so an accommodation keeps
# one row per DATE_TYPE.
price_df = pd.concat([price_Weekday_df, price_Holiday_df], ignore_index=True)
price_df = price_df.astype({'ACCOMMODATION_ID': 'int64', 'PRICE': 'int64'})
# total_price = pd.concat([total_price_df, price_df], ignore_index=True)



KeyboardInterrupt



In [None]:
accommodation_df

In [None]:
review_df

In [None]:
price_df

# Load

In [None]:
# Make sure today's "<date>_Tables" directory exists, then write the three
# list-page tables into it as Parquet files.
directory = create_dir_if_not_exists()
create_parquet_Basic(directory, accommodation_df, review_df, price_df)

저장 완료: ./2024-11-03_Tables\accommodation_table.parquet
저장 완료: ./2024-11-03_Tables\review_table.parquet
저장 완료: ./2024-11-03_Tables\price_table.parquet


---

# Crawling Detail

In [5]:
# Output directory of an earlier run; hard-coded here instead of being taken
# from create_dir_if_not_exists().
directory = "./2024-11-03_Tables"

# Reload the accommodation table saved by the list crawl.
accommodation_df = pd.read_parquet(directory+"/accommodation_table.parquet")
# Pair each 'ACCOMMODATION_ID' with its 'ACCOMMODATION_MAINCATEGORY';
# crawl_detail_page iterates over these (id, category) pairs.
accommodation_id_category_list = list(zip(accommodation_df['ACCOMMODATION_ID'], accommodation_df['ACCOMMODATION_MAINCATEGORY']))

# Display the pairs as the cell output.
accommodation_id_category_list

[(69476, 'Hotel_Resort'),
 (75023, 'Hotel_Resort'),
 (67765, 'Hotel_Resort'),
 (6877, 'Hotel_Resort'),
 (58125, 'Hotel_Resort'),
 (47787, 'Hotel_Resort'),
 (74073, 'Hotel_Resort'),
 (67751, 'Hotel_Resort'),
 (7037, 'Hotel_Resort'),
 (55458, 'Hotel_Resort'),
 (68825, 'Hotel_Resort'),
 (80655, 'Hotel_Resort'),
 (81286, 'Hotel_Resort'),
 (81859, 'Hotel_Resort'),
 (46258, 'Hotel_Resort'),
 (7005, 'Hotel_Resort'),
 (58951, 'Hotel_Resort'),
 (67222, 'Hotel_Resort'),
 (53770, 'Hotel_Resort'),
 (48662, 'Hotel_Resort'),
 (75303, 'Hotel_Resort'),
 (13568, 'Hotel_Resort'),
 (65114, 'Hotel_Resort'),
 (67757, 'Hotel_Resort'),
 (72413, 'Hotel_Resort'),
 (6792, 'Hotel_Resort'),
 (66889, 'Hotel_Resort'),
 (55288, 'Hotel_Resort'),
 (79298, 'Hotel_Resort'),
 (64377, 'Hotel_Resort'),
 (6301, 'Hotel_Resort'),
 (10720, 'Hotel_Resort'),
 (63662, 'Hotel_Resort'),
 (70223, 'Hotel_Resort'),
 (45047, 'Hotel_Resort'),
 (6994, 'Hotel_Resort'),
 (53532, 'Hotel_Resort'),
 (72261, 'Hotel_Resort'),
 (47945, 'Hotel_Re

# DEF : Crawling Detail

In [12]:
def Make_Detail_DataFrame():
    """Create the two empty tables filled by the detail-page crawl.

    Returns:
        Tuple `(Location_df, Facilities_df)`; each is a DataFrame with its
        column names set and no rows.
    """
    location_columns = (
        'ACCOMMODATION_ID',
        'ACCOMMODATION_LOCATION_MAJOR',
        'ACCOMMODATION_LOCATION_MIDDLE',
        'ACCOMMODATION_LOCATION_SUB',
        'ACCOMMODATION_LOCATION_DETAIL',
    )
    facilities_columns = (
        'ACCOMMODATION_ID',
        'ACCOMMODATION_MAINCATEGORY',
        'ACCOMMODATION_FACILITIES',
    )

    # Each table is built from a dict of empty lists, one entry per column.
    Location_df = pd.DataFrame({column: [] for column in location_columns})
    Facilities_df = pd.DataFrame({column: [] for column in facilities_columns})
    return Location_df, Facilities_df

def extract_accommodation_location(driver):
    """Split the address shown on the current detail page into four parts.

    The address text is split on whitespace. The first three tokens become
    the first three parts, and any remaining tokens are re-joined with single
    spaces as the fourth part.

    Args:
        driver: Selenium WebDriver already positioned on a detail page.

    Returns:
        Tuple `(first_part, second_part, third_part, fourth_part)`; positions
        for which the address has no token are None.
    """
    address_text = driver.find_element(
        By.CSS_SELECTOR, "div.css-z8nsir span.css-1t5t2dt"
    ).text.strip()
    tokens = address_text.split()

    # Pad the leading tokens with None up to exactly three entries.
    leading = tokens[:3]
    leading = leading + [None] * (3 - len(leading))
    first_part, second_part, third_part = leading

    # Everything after the third token forms the detail part, if present.
    fourth_part = ' '.join(tokens[3:]) if len(tokens) > 3 else None

    return first_part, second_part, third_part, fourth_part

def extract_accommodation_facilities(driver):
    """Collect the facility labels of the current detail page as one string.

    Args:
        driver: Selenium WebDriver already positioned on a detail page.

    Returns:
        The texts of all elements with class "css-i3rab1", joined with ", ";
        an empty string when no such element is found.
    """
    facility_nodes = driver.find_elements(By.CLASS_NAME, "css-i3rab1")
    return ", ".join(node.text for node in facility_nodes)

def update_detail_dataframe(accommodation_id, Location_df, Facilities_df, first_part, second_part, third_part, fourth_part, Facilities, Category):
    """Append one accommodation's location and facilities rows.

    Args:
        accommodation_id: ID written to both tables.
        Location_df: Location table accumulated so far.
        Facilities_df: Facilities table accumulated so far.
        first_part, second_part, third_part, fourth_part: Address parts for
            the MAJOR, MIDDLE, SUB and DETAIL location columns.
        Facilities: Facilities text for the ACCOMMODATION_FACILITIES column.
        Category: Value for the ACCOMMODATION_MAINCATEGORY column.

    Returns:
        New `(Location_df, Facilities_df)` with one row appended to each and
        the index renumbered; the input frames are not modified.
    """
    location_row = pd.DataFrame({
        'ACCOMMODATION_ID': [accommodation_id],
        'ACCOMMODATION_LOCATION_MAJOR': [first_part],
        'ACCOMMODATION_LOCATION_MIDDLE': [second_part],
        'ACCOMMODATION_LOCATION_SUB': [third_part],
        'ACCOMMODATION_LOCATION_DETAIL': [fourth_part],
    })
    facilities_row = pd.DataFrame({
        'ACCOMMODATION_ID': [accommodation_id],
        'ACCOMMODATION_MAINCATEGORY': [Category],
        'ACCOMMODATION_FACILITIES': [Facilities],
    })

    return (
        pd.concat([Location_df, location_row], ignore_index=True),
        pd.concat([Facilities_df, facilities_row], ignore_index=True),
    )

def crawl_detail_page(driver, accommodation_id_category_list):
    """Visit each accommodation's detail page and collect address and facilities.

    A page on which the address element cannot be found is reported and
    skipped, so the rows collected from other pages are still returned.

    Args:
        driver: Selenium WebDriver used to load the pages.
        accommodation_id_category_list: Iterable of
            `(accommodation_id, main_category)` pairs.

    Returns:
        Tuple `(Location_df, Facilities_df)` with one row per page that was
        crawled successfully.
    """
    Location_df, Facilities_df = Make_Detail_DataFrame()

    for accommodation_id, main_category in accommodation_id_category_list:
        url = f"https://www.yeogi.com/domestic-accommodations/{accommodation_id}" + "?"
        driver.get(url)

        # Fixed pause before reading elements from the page.
        time.sleep(2)

        print("crawlingurl : ", url)
        try:
            first_part, second_part, third_part, fourth_part = extract_accommodation_location(driver)
        except NoSuchElementException:
            # Previously this exception propagated and aborted the whole loop,
            # discarding every row gathered so far.
            print(f"Address element not found, skipping: {url}")
            continue
        Facilities = extract_accommodation_facilities(driver)

        Location_df, Facilities_df = update_detail_dataframe(
            accommodation_id, Location_df, Facilities_df,
            first_part, second_part, third_part, fourth_part,
            Facilities, main_category,
        )
        print("crawlingurl : ", url , "end")
    return Location_df, Facilities_df

# Parquet file save function (detail-page tables)
def create_parquet_Detail(directory, Location_df, Facilities_df):
    """Write the location and facilities tables as Parquet files.

    Files are written into `directory` with the pyarrow engine and without
    the index, location first and facilities second. A confirmation line is
    printed after each file.

    Args:
        directory: Existing target directory.
        Location_df: Saved as 'accommodation_Location_table.parquet'.
        Facilities_df: Saved as 'accommodation_Facilities_table.parquet'.
    """
    outputs = (
        ('accommodation_Location_table.parquet', Location_df),
        ('accommodation_Facilities_table.parquet', Facilities_df),
    )
    for file_name, table in outputs:
        target_path = os.path.join(directory, file_name)
        table.to_parquet(target_path, engine='pyarrow', index=False)
        print(f"저장 완료: {target_path}")

# Testing

In [8]:
# Open a new headless Chrome session for the detail-page crawl.
driver = crawling_setting()

In [9]:
# Crawl the detail page of every (id, category) pair; the function pauses
# 2 seconds after loading each page.
Location_df, Facilities_df = crawl_detail_page(driver, accommodation_id_category_list)

crawlingurl :  https://www.yeogi.com/domestic-accommodations/69476?
crawlingurl :  https://www.yeogi.com/domestic-accommodations/69476? end
crawlingurl :  https://www.yeogi.com/domestic-accommodations/75023?
crawlingurl :  https://www.yeogi.com/domestic-accommodations/75023? end
crawlingurl :  https://www.yeogi.com/domestic-accommodations/67765?
crawlingurl :  https://www.yeogi.com/domestic-accommodations/67765? end
crawlingurl :  https://www.yeogi.com/domestic-accommodations/6877?
crawlingurl :  https://www.yeogi.com/domestic-accommodations/6877? end
crawlingurl :  https://www.yeogi.com/domestic-accommodations/58125?
crawlingurl :  https://www.yeogi.com/domestic-accommodations/58125? end
crawlingurl :  https://www.yeogi.com/domestic-accommodations/47787?
crawlingurl :  https://www.yeogi.com/domestic-accommodations/47787? end
crawlingurl :  https://www.yeogi.com/domestic-accommodations/74073?
crawlingurl :  https://www.yeogi.com/domestic-accommodations/74073? end
crawlingurl :  https:/

In [10]:
Location_df

Unnamed: 0,ACCOMMODATION_ID,ACCOMMODATION_LOCATION_MAJOR,ACCOMMODATION_LOCATION_MIDDLE,ACCOMMODATION_LOCATION_SUB,ACCOMMODATION_LOCATION_DETAIL
0,69476.0,경기,광명시,일직동,512-3
1,75023.0,경기,시흥시,정왕동,2725
2,67765.0,경기,성남시,수정구,시흥동 296-3
3,6877.0,경기,화성시,반송동,92-6
4,58125.0,경기,김포시,고촌읍,전호리 634
...,...,...,...,...,...
1975,65347.0,충북,음성군,맹동면,두성리 1471
1976,68100.0,충북,청주시,흥덕구,가경동 1017
1977,7407.0,충북,청주시,상당구,명암동 2-1
1978,78985.0,충북,청주시,흥덕구,가경동 1456-5


In [11]:
Facilities_df

Unnamed: 0,ACCOMMODATION_ID,ACCOMMODATION_MAINCATEGORY,ACCOMMODATION_FACILITIES
0,69476.0,Hotel_Resort,"수영장, 무선인터넷, 욕실용품, 레스토랑, 금연, TV, 에어컨, 냉장고, 샤워실,..."
1,75023.0,Hotel_Resort,"무선인터넷, 욕실용품, 금연, TV, 객실내취사, 에어컨, 냉장고, 짐보관가능, 샤..."
2,67765.0,Hotel_Resort,"피트니스, 수영장, 무선인터넷, 욕실용품, 레스토랑, 금연, TV, 에어컨, 냉장고..."
3,6877.0,Hotel_Resort,"피트니스, 미니바, 무선인터넷, 욕실용품, 레스토랑, 금연, TV, 카페, 장애인편의"
4,58125.0,Hotel_Resort,"피트니스, 무선인터넷, 욕실용품, 레스토랑, 금연, TV, 냉장고, 샤워실, 무료주..."
...,...,...,...
1975,65347.0,Hotel_Resort,"무선인터넷, 욕실용품, 금연, TV, 에어컨, 냉장고, 짐보관가능, 샤워실, 무료주..."
1976,68100.0,Hotel_Resort,"무선인터넷, 욕실용품, 레스토랑, 금연, TV, 에어컨, 냉장고, 샤워실, 욕조, ..."
1977,7407.0,Hotel_Resort,"무선인터넷, 욕실용품, 레스토랑, 금연, 샤워실, 무료주차, 카페, 주차장"
1978,78985.0,Hotel_Resort,"반려견동반, 조식제공, 무선인터넷, 욕실용품, 금연, TV, 에어컨, 냉장고, 짐보..."


In [14]:
# The displayed frames show ACCOMMODATION_ID as a float (e.g. 69476.0), so
# cast it to int64 before saving.
Location_df = Location_df.astype({'ACCOMMODATION_ID': 'int64'})
Facilities_df = Facilities_df.astype({'ACCOMMODATION_ID': 'int64'})

# Write next to the list-page tables of the same (hard-coded) run directory.
directory = "./2024-11-03_Tables"
create_parquet_Detail(directory, Location_df, Facilities_df)

저장 완료: ./2024-11-03_Tables\accommodation_Location_table.parquet
저장 완료: ./2024-11-03_Tables\accommodation_Facilities_table.parquet
