In [1]:
import os
import random
import re
import time
from datetime import datetime, timedelta

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Chrome launch configuration; the flags are applied in the order listed.
options = webdriver.ChromeOptions()
for chrome_flag in (
    '--no-sandbox',
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "--lang=en",
    '--disable-dev-shm-usage',
):
    options.add_argument(chrome_flag)

# Start the browser session shared by every function and loop below.
chromedriver_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chromedriver_service, options=options)

# Search parameters
city = "New York (and vicinity), New York, United States of America"
numOfAdults = 2
numOfChild = 0
numOfRooms = 1

# Expedia hotel-search endpoint; query parameters are appended per search
base_url = "https://www.expedia.com/Hotel-Search"

# Snapshot date: the moment this run started
snapshot_date = datetime.today()

# Results go to Data/expedia_results - DD.MM.YYYY.csv (folder created on demand)
output_dir = "Data"
os.makedirs(output_dir, exist_ok=True)
snapshot_stamp = snapshot_date.strftime('%d.%m.%Y')
csv_filename = os.path.join(output_dir, f"expedia_results - {snapshot_stamp}.csv")

# Empty results table; one row is added per scraped hotel
result_columns = [
    "Snapshot Date", "TTT", "LOS", "Hotel Name", "Price",
    "Rating", "Reviews", "Neighborhood", "Breakfast", "Free Cancellation", "Rooms Left",
    "Check-in", "Check-out",
]
df = pd.DataFrame(columns=result_columns)


# Function to update check-in and check-out dates
def update_dates(start_date, checkin_days_to_add, checkout_days_to_add):
    """Return the ``(check-in, check-out)`` dates as ``YYYY-MM-DD`` strings.

    Args:
        start_date: Reference date as a ``YYYY-MM-DD`` string.
        checkin_days_to_add: Days between ``start_date`` and check-in.
        checkout_days_to_add: Days between check-in and check-out.

    Returns:
        A 2-tuple of strings: check-in date, then check-out date.

    Raises:
        ValueError: If ``start_date`` is not in ``%Y-%m-%d`` format.
    """
    iso_format = "%Y-%m-%d"
    arrival = datetime.strptime(start_date, iso_format) + timedelta(days=checkin_days_to_add)
    # Check-out is measured from check-in, not from the reference date.
    departure = arrival + timedelta(days=checkout_days_to_add)
    return tuple(day.strftime(iso_format) for day in (arrival, departure))


# Function to close popup if it exists
def close_popup_if_exists():
    """Dismiss the sign-in popup if it shows up; otherwise do nothing.

    Waits up to 5 seconds for the dismiss button to become clickable on the
    module-level ``driver``. After a successful click it pauses for a random
    1-2 seconds so the next action does not follow immediately.
    """
    dismiss_locator = (By.XPATH, "//button[@aria-label='Dismiss sign in information.']")
    try:
        WebDriverWait(driver, 5).until(EC.element_to_be_clickable(dismiss_locator)).click()
    except (TimeoutException, NoSuchElementException):
        return  # No popup detected
    time.sleep(random.uniform(1, 2))  # Prevent rapid actions


# Function to save progress to CSV every 100 hotels
def save_progress():
    """Write the module-level ``df`` to ``csv_filename`` and report the row count.

    Does nothing while ``df`` is still empty, so no file is written before
    the first hotel has been collected.
    """
    if df.empty:
        return
    df.to_csv(csv_filename, index=False)
    print(f"Saved {len(df)} hotels to CSV ({csv_filename}).")


# Loop over TTT (Time to Travel) and LOS (Length of Stay).
#
# For every (TTT, LOS) pair the search page is opened, scrolled and expanded
# until HOTELS_PER_SEARCH distinct hotels have been collected or the page stops
# yielding new ones. Results are appended to the module-level ``df`` and written
# to ``csv_filename`` after each search.
search_date = datetime.now()
search_date_str = search_date.strftime('%Y-%m-%d')  # Loop-invariant, so format it once

HOTELS_PER_SEARCH = 100  # Maximum number of hotels recorded per (TTT, LOS) search
MAX_STALLED_ROUNDS = 10  # Stop a search after this many scroll rounds without a new hotel

HOTEL_CARD_XPATH = '//div[contains(@class, "uitk-card uitk-card-roundcorner-all")]'
HOTEL_NAME_XPATH = './/h3[contains(@class, "uitk-heading uitk-heading-5")]'


def _card_text(card, xpath, default=None):
    """Return the text of the first element under ``card`` matching ``xpath``.

    Returns ``default`` when the lookup fails with a WebDriver error (for
    example the element is missing or the card has gone stale). Other
    exceptions, including KeyboardInterrupt, propagate.
    """
    try:
        return card.find_element(By.XPATH, xpath).text
    except WebDriverException:
        return default


def _first_word(text):
    """Return the first whitespace-separated word of ``text``, or "N/A" if there is none."""
    words = (text or "").split()
    return words[0] if words else "N/A"


def _scrape_card_details(card):
    """Return the per-hotel result columns (everything except the name) for one card.

    Fields that cannot be read fall back to "N/A", or to "No" for the yes/no
    columns.
    """
    neighborhood = _card_text(
        card, './/div[contains(@class, "uitk-text uitk-text-spacing-half")]', default="N/A")

    # The "Price" column holds the total after taxes, reduced to its digits.
    price_text = _card_text(
        card, './/div[@class="uitk-text uitk-type-end uitk-type-200 uitk-text-default-theme"]')
    price_after_taxes = "N/A" if price_text is None else re.sub(r"[^\d]", "", price_text)

    reviews_desc = _first_word(
        _card_text(card, './/span[contains(@class, "uitk-text uitk-type-300")]'))
    num_reviews = _first_word(
        _card_text(card, './/span[contains(@class, "uitk-text uitk-type-200")]'))

    breakfast_info = _card_text(
        card, './/div[contains(@class, "uitk-text truncate-lines-2 uitk-type-200")]', default="")
    refund_info = _card_text(
        card, './/span[contains(@class, "uitk-text uitk-type-300 uitk-text-positive-theme")]',
        default="")

    # Rooms left is the first number found in the badge text.
    badge_text = _card_text(card, './/span[contains(@class, "uitk-badge-text")]', default="")
    rooms_match = re.search(r'(\d+)', badge_text)

    return {
        "Price": price_after_taxes,
        "Rating": reviews_desc,
        "Reviews": num_reviews,
        "Neighborhood": neighborhood,
        "Breakfast": "Yes" if "Breakfast included" in breakfast_info else "No",
        "Free Cancellation": "Yes" if "Fully refundable" in refund_info else "No",
        "Rooms Left": rooms_match.group(1) if rooms_match else "N/A",
    }


try:
    for TTT in range(0, 31):  # Start from today
        for LOS in range(1, 6):

            # Update check-in and check-out dates
            checkin_date, checkout_date = update_dates(search_date_str, TTT, LOS)
            # Construct updated URL
            updated_url = (
                f"{base_url}?destination={city.replace(' ', '%20')}&regionId=178293&latLong=40.75668,-73.98647"
                f"&d1={checkin_date}&startDate={checkin_date}&d2={checkout_date}&endDate={checkout_date}"
                f"&adults={numOfAdults}&rooms={numOfRooms}&isInvalidatedDate=false&useRewards=false&sort=RECOMMENDED"
            )

            print(f"Generated URL: {updated_url}")  # Print the generated URL for debugging

            # Open updated URL
            driver.get(updated_url)
            time.sleep(random.uniform(1, 2))  # Allow page to load

            close_popup_if_exists()  # Close popup if exists
            time.sleep(random.uniform(6, 8))

            search_rows = []  # Rows collected for this (TTT, LOS) search
            seen_hotels = set()  # Names already recorded in this search
            hotel_count = 0  # Counter to track the number of hotels
            stalled_rounds = 0  # Consecutive rounds that produced no new hotel

            while hotel_count < HOTELS_PER_SEARCH and stalled_rounds < MAX_STALLED_ROUNDS:
                # Scroll down to load more results
                scroll_distance = random.randint(200, 800)
                driver.execute_script(f"window.scrollBy(0, {scroll_distance});")
                time.sleep(random.uniform(1, 2))

                # Click 'Show more' button if available
                try:
                    show_more_button = WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Show more')]"))
                    )
                    show_more_button.click()
                    time.sleep(random.uniform(1, 2))
                except (TimeoutException, NoSuchElementException):
                    pass  # No 'Show more' button available

                count_before_round = hotel_count

                # Every round sees all cards on the page again, so cards that
                # were already recorded must be skipped to avoid duplicate rows.
                for hotel in driver.find_elements(By.XPATH, HOTEL_CARD_XPATH):
                    if hotel_count >= HOTELS_PER_SEARCH:
                        break

                    # Read the name first: cards without one are not usable
                    # results, and known names need no further lookups.
                    hotel_name = _card_text(hotel, HOTEL_NAME_XPATH)
                    if not hotel_name or hotel_name in seen_hotels:
                        continue
                    seen_hotels.add(hotel_name)

                    search_rows.append({
                        "Snapshot Date": snapshot_date.strftime("%Y-%m-%d"),
                        "TTT": TTT,
                        "LOS": LOS,
                        "Hotel Name": hotel_name,
                        **_scrape_card_details(hotel),
                        "Check-in": checkin_date,
                        "Check-out": checkout_date,
                    })
                    hotel_count += 1

                # Without this guard a search with fewer than HOTELS_PER_SEARCH
                # results would scroll forever.
                if hotel_count > count_before_round:
                    stalled_rounds = 0
                else:
                    stalled_rounds += 1

            if hotel_count < HOTELS_PER_SEARCH:
                print(f"Only {hotel_count} hotels found for TTT={TTT}, LOS={LOS}; moving on.")

            # Append the whole search at once instead of one concat per row,
            # then persist so a later failure cannot lose completed searches.
            if search_rows:
                new_rows = pd.DataFrame(search_rows, columns=list(df.columns))
                df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)
                save_progress()

    # Final save before exiting
    save_progress()
    print(f"Successfully saved {csv_filename}")
finally:
    # Close WebDriver even when the run is interrupted or fails
    driver.quit()


Generated URL: https://www.expedia.com/Hotel-Search?destination=New%20York%20(and%20vicinity),%20New%20York,%20United%20States%20of%20America&regionId=178293&latLong=40.75668,-73.98647&d1=2025-03-19&startDate=2025-03-19&d2=2025-03-20&endDate=2025-03-20&adults=2&rooms=1&isInvalidatedDate=false&useRewards=false&sort=RECOMMENDED
Saved 100 hotels to CSV (Data\expedia_results - 19.03.2025.csv).
Generated URL: https://www.expedia.com/Hotel-Search?destination=New%20York%20(and%20vicinity),%20New%20York,%20United%20States%20of%20America&regionId=178293&latLong=40.75668,-73.98647&d1=2025-03-19&startDate=2025-03-19&d2=2025-03-21&endDate=2025-03-21&adults=2&rooms=1&isInvalidatedDate=false&useRewards=false&sort=RECOMMENDED
Saved 200 hotels to CSV (Data\expedia_results - 19.03.2025.csv).
Generated URL: https://www.expedia.com/Hotel-Search?destination=New%20York%20(and%20vicinity),%20New%20York,%20United%20States%20of%20America&regionId=178293&latLong=40.75668,-73.98647&d1=2025-03-19&startDate=2025-