In [2]:
# Install the scraping dependencies used by this notebook.
# `%pip` (rather than `!pip`) installs into the environment of the running
# kernel, so the packages are importable from the cells below.
%pip install selenium beautifulsoup4 webdriver-manager



In [1]:
import pandas as pd 

# Load the pickled DataFrame; main() below reads its 'HotelLink' column.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs on the original author's machine. Unpickling executes arbitrary code,
# so only load pickle files that come from a trusted source.
df = pd.read_pickle("/Users/ahmed/Documents/ESILV/s9/web scraping/processed_data_webScraping5.pkl")

In [3]:
import csv
import time
import os
import pandas as pd
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    NoSuchElementException,
    ElementClickInterceptedException,
    TimeoutException
)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

def file_is_empty(path):
    """
    Tell whether ``path`` is missing or contains zero bytes.

    Callers use the answer to decide if a CSV header row still has to be
    written before appending data.
    """
    # A file that does not exist yet counts as empty.
    if not os.path.exists(path):
        return True
    size_in_bytes = os.stat(path).st_size
    return size_in_bytes == 0

def scrape_hotel_images(hotel_url, driver=None, wait=None, page_load_delay=3):
    """
    Load a single hotel page and collect the ``src`` of its gallery images.

    Args:
        hotel_url: URL of the hotel page to open.
        driver: Optional Selenium WebDriver to reuse. When omitted, a Chrome
            driver is created for this call and quit before returning.
        wait: Optional WebDriverWait bound to ``driver``. When omitted (or
            when a local driver is created), a 15-second wait is built for
            the driver in use.
        page_load_delay: Seconds to sleep after ``driver.get`` before
            interacting with the page (defaults to the original 3 seconds).

    Returns:
        A list of non-empty image ``src`` strings, in page order. The list is
        empty when no element matches the image selector.

    Raises:
        Any exception raised while loading the page or reading the images is
        propagated to the caller. Failures while dismissing the cookie banner
        are reported with ``print`` and otherwise ignored.
    """
    # Track ownership so a caller-supplied driver is never quit here.
    local_driver_used = driver is None
    if local_driver_used:
        driver = webdriver.Chrome()
        wait = WebDriverWait(driver, 15)
    elif wait is None:
        # A driver was supplied without a wait: build one instead of failing
        # later with an AttributeError on ``None``.
        wait = WebDriverWait(driver, 15)

    try:
        driver.get(hotel_url)
        time.sleep(page_load_delay)  # Let page load (consider explicit waits in production)

        # Dismissing the cookie banner is best-effort: the scrape continues
        # whether or not the banner is present.
        try:
            cookie_btn = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "button#onetrust-reject-all-handler")
            ))
            cookie_btn.click()
            print("Rejected cookies.")
        except TimeoutException:
            print("No cookie popup found.")
        except Exception as exc:
            # Catch Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit still stop a long-running scrape.
            print(f"Could not dismiss cookie popup: {exc}")

        # find_elements returns an empty list when nothing matches, so no
        # NoSuchElementException handling is needed here.
        image_elements = driver.find_elements(
            By.CSS_SELECTOR, "img.e3fa9175ee.d354f8f44f.ba6d792fd4.b1a5e281e7"
        )
        candidate_urls = (img.get_attribute("src") for img in image_elements)
        images = [img_url for img_url in candidate_urls if img_url]
        print(f"Found {len(images)} images.")
    finally:
        # If we used a local driver, quit it
        if local_driver_used:
            driver.quit()

    return images

def main(df, output_csv="hotel_images.csv", failure_csv="hotel_image_scrape_failures.csv"):
    """
    Scrape images for every hotel link in ``df`` and append results to CSV.

    Args:
        df: DataFrame with a 'HotelLink' column; null links are skipped.
        output_csv: Path of the CSV that receives one ``url,images`` row per
            successfully scraped hotel (image URLs are newline-joined).
        failure_csv: Path of the CSV that receives one ``url,reason`` row per
            hotel whose scrape raised an exception.

    Both files are opened in append mode, and a header is written only when
    the file is empty. Each row is flushed as soon as it is written so that
    completed work survives an interrupted run. A single Chrome driver is
    shared across all URLs and quit when the function exits.
    """
    # STEP 1: Collect the hotel links we want to scrape
    hotel_links = df['HotelLink'].dropna().tolist()

    print(f"Found {len(hotel_links)} hotel links to process.")

    # STEP 2: Prepare for writing results
    fieldnames = ["url", "images"]

    # We will open one browser instance and reuse it to be more efficient
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 15)

    try:
        # Both CSVs are opened in append mode so repeated runs accumulate rows.
        with open(output_csv, "a", newline="", encoding="utf-8") as f_out, \
                open(failure_csv, "a", newline="", encoding="utf-8") as f_fail:
            writer = csv.DictWriter(f_out, fieldnames=fieldnames)
            # If file is empty, write the header
            if file_is_empty(output_csv):
                writer.writeheader()

            failure_writer = csv.writer(f_fail)
            # If the file is empty, write a simple header
            if file_is_empty(failure_csv):
                failure_writer.writerow(["url", "reason"])

            # STEP 3: Loop over all links and scrape images
            for url in hotel_links:
                print(f"Scraping images for: {url}")
                try:
                    images = scrape_hotel_images(url, driver, wait)
                    row = {
                        "url": url,
                        "images": "\n".join(images)
                    }
                    writer.writerow(row)
                    # Push the row to disk now; otherwise it stays in the
                    # file buffer until the file is closed.
                    f_out.flush()
                    print(f"Successfully scraped images for: {url}")
                except Exception as e:
                    # Record the failing URL with its error message and move on.
                    print(f"Error scraping images for {url}: {e}")
                    failure_writer.writerow([url, str(e)])
                    f_fail.flush()
                    continue

    finally:
        # Close the shared driver after finishing all
        driver.quit()

    print("Image scraping completed.")

# Entry point: load the hotel DataFrame and run the image scrape.
if __name__ == "__main__":
    # Load your DataFrame here
    # NOTE(review): absolute, machine-specific path; the same pickle is also
    # read into `df` by an earlier cell. Only unpickle files from a trusted source.
    df = pd.read_pickle("/Users/ahmed/Documents/ESILV/s9/web scraping/processed_data_webScraping5.pkl")
    main(df)


Found 154 hotel links to process.
Scraping images for: https://www.booking.com/hotel/fr/mercure-terminus-est.fr.html?aid=304142&label=gen173nr-1FCAQoggJCDHNlYXJjaF9wYXJpc0gzWARoTYgBAZgBDbgBB8gBDNgBAegBAfgBA4gCAagCA7gC96enuwbAAgHSAiQ5ZjEyZmI3YS05ZmIwLTQ3ZmYtYjM4My0yZGZmNDVmZTA5YTHYAgXgAgE&ucfs=1&arphpl=1&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=1&hapos=1&sr_order=popularity&nflt=SustainablePropertyLevelFilter%3D4&srpvid=524095fbb1e5017d&srepoch=1734988838&from_sustainable_property_sr=1&from=searchresults
Rejected cookies.
Found 9 images.
Successfully scraped images for: https://www.booking.com/hotel/fr/mercure-terminus-est.fr.html?aid=304142&label=gen173nr-1FCAQoggJCDHNlYXJjaF9wYXJpc0gzWARoTYgBAZgBDbgBB8gBDNgBAegBAfgBA4gCAagCA7gC96enuwbAAgHSAiQ5ZjEyZmI3YS05ZmIwLTQ3ZmYtYjM4My0yZGZmNDVmZTA5YTHYAgXgAgE&ucfs=1&arphpl=1&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=1&hapos=1&sr_order=popularity&nflt=SustainablePropertyLevelFilt

In [4]:
# Read a hotel_images.csv back into a DataFrame for inspection.
# NOTE(review): main() writes to the relative path "hotel_images.csv", while this
# reads an absolute path; presumably the notebook's working directory is that
# web_scraping folder, so both refer to the same file. Confirm.
df_images = pd.read_csv("/Users/ahmed/Documents/ESILV/s9/web scraping/ecostay/ecostay/web_scraping/hotel_images.csv")

In [6]:
# Show the "images" value of the first row; main() stores a hotel's image
# URLs in this column as a single newline-joined string.
print(df_images["images"].iloc[0])

https://cf.bstatic.com/xdata/images/hotel/max1024x768/621671008.jpg?k=dba8051bf1508f2975f010b10df313a61052bdac8fae0c0a423bdac138541667&o=
https://cf.bstatic.com/xdata/images/hotel/max500/621671010.jpg?k=0e7e48b3cbe3587e152e29bf40772c800aca7545fdffda9c3daa47068192c5de&o=
https://cf.bstatic.com/xdata/images/hotel/max500/621671020.jpg?k=4531a8c7d11e2e512fcac1915083928233c041e2ea3fd8e62068e5e5d4911cdc&o=
https://cf.bstatic.com/xdata/images/hotel/max300/580039967.jpg?k=575e38771f4316d5dab0558298d40873f88bc1915f55723a862b58f318a46669&o=
https://cf.bstatic.com/xdata/images/hotel/max300/580039965.jpg?k=f46da94b0afcdac30b5f1f16ec9b8356f695dbd1bcc51dfac05353b1c68aca1b&o=
https://cf.bstatic.com/xdata/images/hotel/max300/580040026.jpg?k=2847c80f3ef14932f2849fdbf4127543cf921e59fee1b8b15ca6a650b5906942&o=
https://cf.bstatic.com/xdata/images/hotel/max300/580040056.jpg?k=7ae9f267289f9d100c11591bbb059324b449e98886455216902ba3d36f8cfec3&o=
https://cf.bstatic.com/xdata/images/hotel/max300/580040085.jpg?k