In [8]:
import dash
from dash import html, dcc, Input, Output, State
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import numpy as np
import plotly.express as px

In [9]:
def fetch_hotel_data(city, checkin_date, checkout_date, max_pages=4, timeout=30):
    """Scrape Booking.com search results for a city and date range.

    Parameters
    ----------
    city : str
        Destination, sent as the ``ss`` query parameter.
    checkin_date, checkout_date : str
        Dates in ``YYYY-MM-DD`` format.
    max_pages : int, optional
        Maximum number of result pages to request (default 4).
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per hotel card found, with the text columns ``Name``,
        ``Location``, ``Price``, ``Rating``, ``Distance`` and ``Comment``.
        A field that is absent from a card holds a ``"No <Field>"``
        placeholder. The columns are present even when nothing was scraped.

    Raises
    ------
    ValueError
        If either date does not parse as ``YYYY-MM-DD``.
    requests.exceptions.RequestException
        On network failures, including a timeout.
    """
    # Round-trip through datetime to validate and normalise the dates.
    checkin_date = datetime.strptime(checkin_date, "%Y-%m-%d").strftime("%Y-%m-%d")
    checkout_date = datetime.strptime(checkout_date, "%Y-%m-%d").strftime("%Y-%m-%d")

    search_url = "https://www.booking.com/searchresults.en-gb.html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    }
    page_size = 25  # step of the ``offset`` query parameter between pages
    columns = ["Name", "Location", "Price", "Rating", "Distance", "Comment"]

    # NOTE(review): these CSS class names look auto-generated by the site and
    # will probably change over time; the data-testid selector is likely more
    # stable. Confirm against the live markup before relying on them.
    header_class = 'f6431b446c d5f78961c3'
    card_class = "c82435a4b8 a178069f51 a6ae3c2b40 a18aeea94d d794b7a0f7 f53e278e95 c6710787a4"

    def text_or(node, default):
        """Return the stripped text of a found node, or ``default`` if None."""
        return node.get_text(strip=True) if node is not None else default

    hotels = []

    for page in range(max_pages):
        query = {
            "ss": city,
            "checkin": checkin_date,
            "checkout": checkout_date,
            "offset": page * page_size,
        }
        # ``params`` lets requests URL-encode the values, so city names with
        # spaces, '&' or non-ASCII characters cannot corrupt the query string.
        response = requests.get(search_url, params=query, headers=headers, timeout=timeout)
        if not response.ok:
            # An error page carries no results; stop paging and keep what we have.
            break

        soup = BeautifulSoup(response.content, 'html.parser')

        # A page without the results header is treated as the end of results.
        if soup.find('h1', class_=header_class) is None:
            break

        hotel_boxes = soup.find_all("div", class_=card_class)
        if not hotel_boxes:
            break

        for hotel_box in hotel_boxes:
            hotels.append({
                "Name": text_or(hotel_box.find('div', class_='f6431b446c a15b38c233'), "No Name"),
                "Location": text_or(hotel_box.find('span', class_='aee5343fdb def9bc142a'), "No Location"),
                "Price": text_or(hotel_box.find('span', class_="f6431b446c fbfd7c1165 e84eb96b1f"), "No Price"),
                "Rating": text_or(hotel_box.find("div", class_="a3b8729ab1 d86cee9b25"), "No Rating"),
                "Distance": text_or(hotel_box.find("span", {"data-testid": "distance"}), "No Distance"),
                "Comment": text_or(hotel_box.find('div', class_='a3b8729ab1 e6208ee469 cb2cbb3ccb'), "No Comment"),
            })

    # Passing ``columns`` keeps the schema stable even when no hotel was found,
    # so downstream column lookups do not fail on an empty result.
    return pd.DataFrame(hotels, columns=columns)

# Example query: Berlin, check-in 2023-12-01, check-out 2023-12-05.
city, checkin_date, checkout_date = "Berlin", "2023-12-01", "2023-12-05"

# Scrape the listings and preview the first rows of the raw (all-text) frame.
df = fetch_hotel_data(city, checkin_date, checkout_date)
print(df.head(5))

                                                Name  \
0                      Scandic Berlin Kurfürstendamm   
1  Hampton by Hilton Berlin City Centre Alexander...   
2  Hotel Berlin, Berlin, a member of Radisson Ind...   
3         Leonardo Royal Hotel Berlin Alexanderplatz   
4                      Motel One Berlin-Spittelmarkt   

                             Location       Price Rating            Distance  \
0  Charlottenburg-Wilmersdorf, Berlin  TWD 22,426    8.1  3.2 km from centre   
1                       Mitte, Berlin  TWD 20,339    8.1    3 km from centre   
2                       Mitte, Berlin  TWD 16,441    7.9  2.1 km from centre   
3    Friedrichshain-Kreuzberg, Berlin  TWD 15,474    8.4  3.4 km from centre   
4                       Mitte, Berlin  TWD 18,414    8.7  1.7 km from centre   

     Comment  
0  Very good  
1  Very good  
2       Good  
3  Very good  
4   Fabulous  


In [10]:

# STEP 2: convert the scraped text columns (Price, Rating, Distance) to numeric types
def process_hotel_data(df):
    """Convert the scraped text columns of a hotel frame to numeric types.

    The frame is modified in place and also returned. Calling the function
    again on an already-processed frame leaves the values unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``Price``, ``Rating``, ``Distance``
        and ``Comment``.

    Returns
    -------
    pandas.DataFrame
        The same frame, with:

        * ``Price``    -- nullable ``Int64``; the first number found in the
          text with thousands separators removed, whatever the currency
          prefix. Rows without a number (e.g. ``"No Price"``) become 0.
        * ``Rating``   -- ``float``; non-numeric text becomes NaN.
        * ``Comment``  -- ``str``.
        * ``Distance`` -- ``float`` kilometres; ``"<n> km ..."`` is taken as
          is, ``"<n> m ..."`` is divided by 1000, anything else becomes NaN.
    """
    # Price: drop thousands separators, then pull out the first number so the
    # parser does not depend on one specific currency prefix.
    price_text = (
        df['Price'].astype(str)
        .str.replace(',', '', regex=False)
        .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    )
    price = pd.to_numeric(price_text, errors='coerce').astype('float64')
    # Assign the result back instead of fillna(inplace=True) on a column
    # selection, which does not update the frame under copy-on-write.
    df['Price'] = price.fillna(0).round().astype('Int64')

    df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce').astype(float)
    df['Comment'] = df['Comment'].astype(str)

    # Distance: only parse text. A numeric column has already been converted,
    # and re-parsing it would find no unit and wipe the values to NaN.
    if not pd.api.types.is_numeric_dtype(df['Distance']):
        # Require "<number> <unit>" at the start; a bare substring test for
        # 'm' would also match words such as "from".
        parts = df['Distance'].astype(str).str.extract(r'^\s*(\d+(?:\.\d+)?)\s*(km|m)\b')
        magnitude = pd.to_numeric(parts[0], errors='coerce').astype('float64')
        in_metres = parts[1].isin(['m'])
        df['Distance'] = magnitude.where(~in_metres, magnitude / 1000)

    return df

# Convert the scraped text columns to numeric types, then preview the result.
df = process_hotel_data(df)
print(df.head(5))

                                                Name  \
0                      Scandic Berlin Kurfürstendamm   
1  Hampton by Hilton Berlin City Centre Alexander...   
2  Hotel Berlin, Berlin, a member of Radisson Ind...   
3         Leonardo Royal Hotel Berlin Alexanderplatz   
4                      Motel One Berlin-Spittelmarkt   

                             Location  Price  Rating  Distance    Comment  
0  Charlottenburg-Wilmersdorf, Berlin  22426     8.1       3.2  Very good  
1                       Mitte, Berlin  20339     8.1       3.0  Very good  
2                       Mitte, Berlin  16441     7.9       2.1       Good  
3    Friedrichshain-Kreuzberg, Berlin  15474     8.4       3.4  Very good  
4                       Mitte, Berlin  18414     8.7       1.7   Fabulous  


In [11]:
# Keep only hotels with a usable price, distance and rating.
df_plot = df.dropna(subset=['Price', 'Distance', 'Rating'])
# process_hotel_data stores a missing price as 0 rather than NaN, so dropna
# alone would leave those hotels in and plot them as if they were free.
df_plot = df_plot[df_plot['Price'] > 0]

# Price against distance from the centre, coloured by review rating.
fig = px.scatter(
    df_plot,
    x='Price',
    y='Distance',
    color='Rating',
    hover_name="Name",
    hover_data=['Location', 'Comment'],
    labels={"Price": "Price", "Distance": "Distance (km)", "Rating": "Rating"},
    title="Hotel Price and Distance Scatter Plot",
)

# Show the plot
fig.show()