In [1]:
import re

import pandas as pd
import plotly.graph_objects as go
import plotly_express as px  # import plotly.express as px
import requests
from bs4 import BeautifulSoup

In [2]:
def _element_text(element):
    """Return the stripped text of a parsed element, or None if it is missing or blank."""
    if element is None:
        return None
    return element.text.strip() or None


def _first_number(text):
    """Return the first number found in ``text`` as a float, or None if there is none."""
    # Accept either '.' or ',' as the decimal separator.
    match = re.search(r'\d+(?:[.,]\d+)?', text)
    if match is None:
        return None
    return float(match.group().replace(',', '.'))


def _parse_distance_km(text):
    """Convert a distance label (e.g. '1.5 km ...' or '350 m ...') to kilometres."""
    value = _first_number(text)
    if value is None:
        return None
    # Labels expressed in kilometres are already in the target unit; anything
    # else is treated as metres, which is what the page shows for short
    # distances.  NOTE(review): other units (e.g. miles) are not handled.
    if 'km' in text.lower() or '公里' in text:
        return value
    return value / 1000


def fetch_booking_data(location, checkin_date, checkout_date):
    """Scrape up to 100 hotels from Booking.com search results and plot them.

    Up to five result pages (offsets 0, 25, ..., 100) are requested. A hotel is
    kept only when its name, address, price, rating, distance and review label
    are all present and parseable, and its name has not been seen before.

    Side effects: writes the collected rows to ``hotels.csv`` in the current
    working directory and, when at least one hotel was collected, displays a
    price/distance scatter plot coloured by rating.

    Args:
        location: Search text for the destination (sent as the ``ss`` query
            parameter).
        checkin_date: Check-in date string (sent as ``checkin``).
        checkout_date: Check-out date string (sent as ``checkout``).

    Returns:
        pandas.DataFrame with columns ``name``, ``location``, ``price``,
        ``rating``, ``distance`` (kilometres) and ``comments``. It is empty,
        but still has these columns, when nothing could be scraped.
    """
    url = 'https://www.booking.com/searchresults.html'
    # Built from the function arguments (not notebook globals) and passed via
    # ``params`` so that requests URL-encodes values such as 'New York'.
    base_params = {
        'ss': location,
        'checkin': checkin_date,
        'checkout': checkout_date,
        'group_adults': 2,
        'no_rooms': 1,
        'group_children': 0,
        'lang': 'zh-tw',
        'soz': 1,
        'lang_changed': 1,
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    }
    max_hotels = 100
    page_offsets = (0, 25, 50, 75, 100)
    columns = ['name', 'location', 'price', 'rating', 'distance', 'comments']

    hotels_data = []
    seen_names = set()  # O(1) duplicate detection across pages

    for offset in page_offsets:
        if len(hotels_data) >= max_hotels:
            break
        params = dict(base_params)
        if offset:
            params['offset'] = offset
        # A timeout keeps the notebook from hanging forever on a stalled connection.
        response = requests.get(url, params=params, headers=headers, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Each search result is rendered as a "property-card" element.
        for hotel in soup.find_all('div', {'data-testid': 'property-card'}):
            name = _element_text(hotel.find('div', {'data-testid': 'title'}))
            address = _element_text(hotel.find('span', {'data-testid': 'address'}))
            price_text = _element_text(
                hotel.find('span', {'data-testid': 'price-and-discounted-price'}))
            # The rating and review-count elements are located by generated CSS
            # class names, so these selectors may stop matching if the site changes.
            rating_text = _element_text(
                hotel.find('div', {'class': 'a3b8729ab1 d86cee9b25'}))
            distance_text = _element_text(hotel.find('span', {'data-testid': 'distance'}))
            comments = _element_text(
                hotel.find('div', {'class': 'a3b8729ab1 e6208ee469 cb2cbb3ccb'}))

            # Data cleaning: drop incomplete cards and repeated hotels.
            fields = (name, address, price_text, rating_text, distance_text, comments)
            if None in fields or name in seen_names:
                continue

            price_digits = ''.join(ch for ch in price_text if ch.isdigit())
            rating = _first_number(rating_text)
            distance = _parse_distance_km(distance_text)
            # Skip cards whose numeric fields cannot be parsed instead of crashing.
            if not price_digits or rating is None or distance is None:
                continue

            seen_names.add(name)
            hotels_data.append({
                'name': name,
                'location': address,
                'price': int(price_digits),
                'rating': rating,
                'distance': distance,
                'comments': comments
            })
            if len(hotels_data) >= max_hotels:
                break

    # Explicit columns keep the frame (and CSV header) well-formed even when empty.
    hotels_df = pd.DataFrame(hotels_data, columns=columns)
    hotels_df.to_csv('hotels.csv', header=True, index=False)

    # Data visualization; skipped when there is nothing to plot.
    if not hotels_df.empty:
        fig = px.scatter(
            hotels_df,  # dataset
            x="price",  # x-axis
            y="distance",  # y-axis
            color="rating",  # colour points by rating
            title="Hotel Price and Distance Scatter Plot",  # figure title
            hover_name="name",
            hover_data=['price', 'distance', 'rating']
        )
        fig.show()

    return hotels_df


In [3]:
# Search parameters -- edit these three values to query a different stay.
location_input, checkin_date_input, checkout_date_input = (
    'Taipei',      # desired location
    '2023-12-14',  # desired check-in date
    '2023-12-15',  # desired check-out date
)

# Run the scraper for the stay configured above.
result = fetch_booking_data(
    location=location_input,
    checkin_date=checkin_date_input,
    checkout_date=checkout_date_input,
)

