# Homework 1

# Part 1: Scraping

## Loading all information

In [156]:
# Importing Packages
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException
from selenium import webdriver
import os
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup


In [144]:
# Preferences

def ffx_preferences(dfolder, download=False, firefox_binary_path=None):
    """
    Builds the Firefox options used by the scraper.

    Args:
        dfolder: Folder where Firefox should store downloaded files. It is
            resolved to an absolute path before being handed to Firefox.
        download: When True, only PDF MIME types are saved without asking and
            the built-in PDF viewer is disabled. When False, a broader list
            of document/image MIME types is saved without asking.
        firefox_binary_path: Optional path to the Firefox executable.

    Returns:
        The selenium ``Options`` object with all preferences applied.
    """
    options = Options()

    # Set the Firefox binary location if provided
    if firefox_binary_path:
        options.binary_location = firefox_binary_path

    # Firefox expects an absolute path for the download directory; a relative
    # value such as "./downloads" is not reliably honoured.
    options.set_preference("browser.download.dir", os.path.abspath(dfolder))
    options.set_preference("browser.download.folderList", 2)  # 2 = use the custom folder
    options.set_preference("browser.download.manager.showWhenStarting", False)  # Disable popups

    # The MIME list is written once: the PDF-only list replaces the general
    # list when automatic PDF download is requested.
    if download:
        save_without_asking = "application/pdf,application/x-pdf"
        options.set_preference("pdfjs.disabled", True)  # Disable built-in PDF viewer
    else:
        save_without_asking = (
            "application/msword,application/rtf,application/csv,text/csv,"
            "image/png,image/jpeg,application/pdf,text/html,text/plain,"
            "application/octet-stream"
        )
    options.set_preference("browser.helperApps.neverAsk.saveToDisk", save_without_asking)

    return options

def start_up(link, dfolder, geko_path, firefox_binary_path=None, download=True):
    """
    Initializes the Firefox browser with the given settings and opens a page.

    Args:
        link: URL to open once the browser has started.
        dfolder: Download folder; created if it does not exist.
        geko_path: Path to the geckodriver executable.
        firefox_binary_path: Optional path to the Firefox executable.
        download: Forwarded to ffx_preferences (automatic PDF download).

    Returns:
        The started ``webdriver.Firefox`` instance with ``link`` loaded.

    Raises:
        Whatever ``browser.get`` raises; the browser is closed first.
    """
    os.makedirs(dfolder, exist_ok=True)  # Ensure download directory exists

    options = ffx_preferences(dfolder, download, firefox_binary_path)

    service = Service(executable_path=geko_path)
    browser = webdriver.Firefox(service=service, options=options)

    try:
        browser.get(link)
    except Exception:
        # Do not leave an orphaned Firefox/geckodriver process behind when
        # the initial navigation fails.
        browser.quit()
        raise

    time.sleep(5)  # Fixed pause after navigation; adjust as needed
    return browser

def check_and_click(browser, xpath, type, timeout=10):
    '''
    Waits up to `timeout` seconds for an element to become clickable and
    clicks it.

    `xpath` holds the locator value and `type` names the strategy used to
    interpret it ("xpath", "id", "css", "class" or "link", case-insensitive).

    This function returns:
    - True if click was successful.
    - False otherwise (a message describing the failure is printed).
    '''
    try:
        waiter = WebDriverWait(browser, timeout)

        # Map the user-facing strategy name onto the selenium locator constant.
        strategies = {
            "xpath": By.XPATH,
            "id": By.ID,
            "css": By.CSS_SELECTOR,
            "class": By.CLASS_NAME,
            "link": By.LINK_TEXT,
        }
        strategy = strategies.get(type.lower())
        if strategy is None:
            print(f"Unsupported locator type: {type}")
            return False

        target = waiter.until(EC.element_to_be_clickable((strategy, xpath)))
        target.click()
        return True

    except (ElementClickInterceptedException, NoSuchElementException, StaleElementReferenceException) as e:
        print(f"Error clicking element: {e}")
        return False
    except TimeoutException:
        print(f"Timeout: Element not clickable after {timeout} seconds: {xpath}")
        return False
    except Exception as e:
        print(f"Unexpected exception: {e}")
        return False
    
def check_obscures(browser, xpath, type):
    """
    Tries to click an element immediately (no waiting) to find out whether it
    is reachable or obstructed.

    Args:
        browser: Selenium webdriver instance.
        xpath: Locator value, interpreted according to `type`.
        type: Locator strategy: "xpath", "id", "css", "class" or "link"
            (case-insensitive, like check_and_click).

    Returns:
        True if the element was found and clicked. False if the click was
        intercepted, the element was missing or stale, or the locator type
        is not supported.
    """
    strategies = {
        "xpath": "xpath",
        "id": "id",
        "css": "css selector",
        "class": "class name",
        "link": "link text",
    }
    strategy = strategies.get(type.lower())
    if strategy is None:
        # Previously an unknown type fell through and reported success
        # without clicking anything.
        print(f"Unsupported locator type: {type}")
        return False

    try:
        browser.find_element(strategy, xpath).click()
    except (ElementClickInterceptedException, NoSuchElementException, StaleElementReferenceException) as e:
        print(e)
        return False
    return True

In [145]:
# Launch Firefox through geckodriver and open the Booking.com landing page.
# The two executable paths are machine-specific and must be adapted locally.
link = 'https://www.booking.com/index.es.html'
dfolder = './downloads'
geko_path = r"C:\Users\aleja\OneDrive\Escritorio\Term_2\Text_Mining\geckodriver.exe"
firefox_binary_path = r"C:\Program Files\Mozilla Firefox\firefox.exe"
browser = start_up(
    link=link,
    dfolder=dfolder,
    geko_path=geko_path,
    firefox_binary_path=firefox_binary_path,
)


In [146]:
# Automated language selection

# Open the language picker, then choose Spanish. check_and_click waits until
# each element is clickable, so the menu has time to render after the first
# click instead of being looked up immediately.
if check_and_click(browser, '//button[@aria-controls="header_language_picker"]', "xpath"):
    if not check_and_click(browser, '//span[@lang="es"]', "xpath"):
        print("Failed to select Spanish.")
else:
    print("Failed to open the language picker.")

### Cookie rejection

In [147]:
# Reject the cookie banner through its "reject all" button and report the outcome
success = check_and_click(browser, "onetrust-reject-all-handler", "id", timeout=10)
print("Cookie consent rejected." if success else "Failed to reject cookies.")

Cookie consent rejected.


### Place Input

In [148]:
# Focus the destination ("where are you going?") field
destination_xpath = '//*[@id=":rh:"]'
time.sleep(3)  # Fixed 3-second pause before clicking
browser.find_element(By.XPATH, destination_xpath).click()

# Type the destination; switch to 'Lisboa' (Spanish for Lisbon) if needed
place = 'Barcelona'
search1 = browser.find_element(By.XPATH, destination_xpath)
search1.send_keys(place)

### Date Input

In [149]:
# Clicking the date box

locator = '[data-testid="searchbox-dates-container"]'
# Named locator_type rather than `type` so the built-in type() is not
# shadowed for the rest of the notebook.
locator_type = "css"
success = check_and_click(browser, locator, locator_type, timeout=10)
if success:
    print("Clicked the dates box.")
else:
    print("Failed to click the dates box.")

# Advance the calendar to the target month, relative to the month in which
# the code is run.

from datetime import datetime

TARGET_YEAR, TARGET_MONTH = 2025, 5  # May 2025

today = datetime.now()
# Current month number (1 = January, 2 = February, ..., 12 = December)
current_month = today.month

# Whole months between today and the target month. The year difference is
# included so a run in an earlier year still reaches the target; never negative.
clicks_needed = max(0, (TARGET_YEAR - today.year) * 12 + (TARGET_MONTH - current_month))

# Click the "Next month" button the required number of times
next_month_xpath = '//button[@class="a83ed08757 c21c56c305 f38b6daa18 d691166b09 f671049264 f4552b6561 dc72a8413c f073249358"]'
for _ in range(clicks_needed):
    if not check_and_click(browser, next_month_xpath, "xpath"):
        # Stop instead of silently continuing on the wrong month.
        print("Could not advance the calendar to the target month.")
        break


Clicked the dates box.


In [150]:

# Selecting the dates
path='//div[@id="calendar-searchboxdatepicker"]//table[@class="eb03f3f27f"]//tbody//td[@class="b80d5adb18"]//span[@class="cf06f772fa ef091eb985"]'
dates = browser.find_elements('xpath',path)

arrival = '29-05-2025'    # day of arrival, formatted DD-MM-YYYY
departure = '02-06-2025'  # day of departure, formatted DD-MM-YYYY

# The calendar cells are matched on their "data-date" attribute, which uses
# YYYY-MM-DD. Reversing the DD-MM-YYYY parts produces that format and keeps
# the year from the input instead of hard-coding it.
arrival_iso = "-".join(reversed(arrival.split("-")))
departure_iso = "-".join(reversed(departure.split("-")))

for date in dates:
    cell_date = date.get_attribute("data-date")
    if cell_date == arrival_iso:
        date.click()
    if cell_date == departure_iso:
        date.click()
        break  # Both ends of the stay are selected; nothing left to scan

### Searching

In [151]:
# Submit the search by clicking the search button
search_button_xpath = '//button[@class="a83ed08757 c21c56c305 a4c1805887 f671049264 a2abacf76b c082d89982 cceeb8986b b9fd3c6b3c"]'
browser.find_element(By.XPATH, search_button_xpath).click()

In [152]:
# Closing Genius Pop Up (if it appears)
# check_and_click never raises: every failure (including the pop-up not being
# present) is reported through a False return value, so that is the branch
# that handles the "did not appear" case.
success = check_and_click(browser, '//div[@class="eb33ef7c47"]//button[@class="a83ed08757 c21c56c305 f38b6daa18 d691166b09 ab98298258 f4552b6561"]', "xpath")
if success:
    print("Pop-up closed.")
else:
    print("Pop-up did not appear, continuing...")

Pop-up closed.


In [153]:
# Loading all results

clicks = 0
load_more_xpath = '//button[@class="a83ed08757 c21c56c305 bf0537ecb5 f671049264 af7297d90d c0e0affd09"]'

# Repeat "scroll down, click Load More" until the button can no longer be
# clicked or an unexpected error occurs.
while True:
    try:
        # Scroll target derived from the current page height, so the
        # "Load More" button is brought into reach.
        total_height = browser.execute_script("return document.body.scrollHeight")
        scroll_position = (total_height * 10) - 200
        browser.execute_script(f"window.scrollTo(0, {scroll_position});")

        success = check_and_click(browser, load_more_xpath, "xpath", timeout=10)
    except Exception as e:
        print(f"Error while scrolling and clicking: {e}")
        break

    if not success:
        # Button not found or not clickable: everything has been loaded.
        break
    clicks += 1

print(f"The 'Load More' button has been clicked {clicks} times")

Timeout: Element not clickable after 10 seconds: //button[@class="a83ed08757 c21c56c305 bf0537ecb5 f671049264 af7297d90d c0e0affd09"]
The 'Load More' button has been clicked 37 times


## Data Extraction

In [154]:
# Take the HTML of the page as currently rendered in the browser and parse it
page_source = browser.page_source
soup = BeautifulSoup(markup=page_source, features='html.parser')

In [157]:
def scrape_hotels_metadata(soup):
    """
    Extracts name, price, rating and detail-page link for every hotel card.

    Args:
        soup: Parsed results page; every element with
            data-testid="property-card" is treated as one hotel.

    Returns:
        A pandas DataFrame with the columns 'Name', 'Price', 'Rating' and
        'Hotel Link', one row per card. Name, price and rating are "NA" when
        the corresponding element is missing; the link is None. When the
        page contains no cards, an empty DataFrame with the same columns is
        returned.
    """
    columns = ['Name', 'Price', 'Rating', 'Hotel Link']

    # One [name, price, rating, link] row per hotel card
    hotel_metadata = []

    for hotel in soup.find_all('div', {'data-testid': 'property-card'}):
        # Hotel name
        title_tag = hotel.find('div', {'data-testid': 'title'})
        name = title_tag.get_text(strip=True) if title_tag is not None else "NA"

        # Price: keep digits only (drops currency symbol and separators).
        # A missing price stays "NA" instead of being stripped to "".
        price_tag = hotel.find('span', {'class': 'f6431b446c fbfd7c1165 e84eb96b1f'})
        price = re.sub(r'\D', '', price_tag.get_text(strip=True)) if price_tag is not None else "NA"

        # Rating: the last three characters of the score text are taken as
        # the number, with the decimal comma turned into a dot.
        rating_box = hotel.find('div', {'data-testid': 'review-score'})
        score_tag = rating_box.find('div') if rating_box is not None else None
        rating = score_tag.get_text(strip=True)[-3:].replace(',', '.') if score_tag is not None else "NA"

        # Link to the detailed hotel page (None when absent)
        link_tag = hotel.find('a', {'data-testid': 'title-link'})
        hotel_link = link_tag.get('href') if link_tag is not None else None

        hotel_metadata.append([name, price, rating, hotel_link])

    # Build the frame once, after the loop; this also covers the no-hotel case.
    return pd.DataFrame(hotel_metadata, columns=columns)

def scrape_hotel_description(detail_url, timeout=30):
    """
    Downloads a hotel's detail page and returns its description text.

    Args:
        detail_url: URL of the hotel's detail page.
        timeout: Seconds to wait for the server before giving up, so one
            stalled request cannot block the whole run.

    Returns:
        The text of the element with data-testid="property-description",
        "NA" when the page has no such element, or "Error" when the request
        fails (connection problem, timeout or HTTP error status).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(detail_url, headers=headers, timeout=timeout)
        # Treat HTTP error pages as failures instead of parsing them.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error scraping details from {detail_url}: {e}")
        return "Error"

    page = BeautifulSoup(response.text, 'html.parser')
    description_tag = page.find('p', {'data-testid': 'property-description'})
    return description_tag.get_text(strip=True) if description_tag is not None else "NA"
    
def full_program(soup):
    """
    Runs the whole extraction: listing metadata, per-hotel descriptions, CSV.

    Args:
        soup: Parsed results page, passed on to scrape_hotels_metadata.

    Returns:
        The metadata DataFrame with an added 'Description' column. The same
        data is written to "hotel_data.csv" in the working directory.
    """
    # Extract the listing metadata from the already parsed results page
    metadata = scrape_hotels_metadata(soup)

    # Fetch each hotel's detail page for its description
    descriptions = []
    for detail_url in metadata['Hotel Link']:
        # Hotels without a link (None / missing value) get "NA" directly.
        if pd.notna(detail_url) and detail_url:
            descriptions.append(scrape_hotel_description(detail_url))
            time.sleep(2)  # Add delay to avoid getting blocked
        else:
            descriptions.append("NA")

    # Add the descriptions to the DataFrame
    metadata['Description'] = descriptions

    # Save the DataFrame to a CSV file
    metadata.to_csv("hotel_data.csv", index=False)
    print("Hotel data saved to hotel_data.csv")

    # Hand the result back so callers can keep working with it.
    return metadata

In [158]:
# Run the whole pipeline on the parsed results page. full_program requests
# every hotel's detail page with a 2-second pause between requests and writes
# hotel_data.csv, so this cell can take a long time to finish.
df = full_program(soup)

KeyboardInterrupt: 