## Imports

In [None]:
# FOR DATA PROCESSING:
import pandas as pd
import numpy as np

# FOR MEASURING COMPUTATION TIME, CREATING FIXED DELAYS:
import time

# FOR APPLYING BEAUTIFULSOUP
from bs4 import BeautifulSoup

# FOR APPLYING SELENIUM:
import selenium # Python Selenium
from selenium import webdriver # for specifying webdriver

from webdriver_manager.firefox import GeckoDriverManager # geckodriver for automatized access to Firefox

from selenium.webdriver.chrome.service import Service # needed since Selenium 4.10.0 see: https://github.com/SeleniumHQ/selenium/commit/9f5801c82fb3be3d5850707c46c3f8176e3ccd8e

from selenium.webdriver.support.ui import WebDriverWait # this three enable waiting until sth is displayed on website
from selenium.webdriver.support import expected_conditions as EC # for checking visibility of an element
from selenium.webdriver.common.by import By # for checking element visibility by XPath

from selenium.common.exceptions import TimeoutException


# FOR SAVING DATA:
import pickle # pickle format of saved output

# FOR GETTING TODAY'S DATE
from datetime import datetime, timedelta

from urllib.parse import urljoin

def save_object(obj, filename):
    """Serialize ``obj`` to ``filename`` with pickle.

    The file is opened in binary write mode, so any existing file at that
    path is overwritten. The highest available pickle protocol is used.
    """
    with open(filename, mode='wb') as pickle_file:
        pickle.dump(obj, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
        
# Ask webdriver_manager for geckodriver; the value returned by install() is
# kept in `firefoxpath` and printed so the location is visible in the output.
firefoxpath = GeckoDriverManager().install()
print(firefoxpath)

## Accessing Website

In [None]:
# The Service class has to match the browser being driven: geckodriver is
# launched through Firefox's Service, not the Chrome one imported at the top
# of the file. It is imported locally under an alias so the existing
# file-level `Service` name stays untouched.
from selenium.webdriver.firefox.service import Service as FirefoxService

website = "https://www.newzealand.com/int/"

# Start Firefox through the geckodriver executable stored in `firefoxpath`.
service_firefox = FirefoxService(executable_path=firefoxpath)
options_firefox = webdriver.FirefoxOptions()
driver_firefox = webdriver.Firefox(service=service_firefox, options=options_firefox)  # opens Firefox

driver_firefox.maximize_window()  # maximizes browser's window
driver_firefox.get(website)  # opens a website

## Selenium Automation

In [None]:
# Page whose search icon is clicked; also the base URL for the scraped links.
website_search = "https://www.newzealand.com/int/"
driver_firefox.get(website_search)

start = time.time()  # timestamp taken right after the page was requested

# Pause for 5 s plus a chi-square(3) draw: a strongly right-skewed random
# delay meant to imitate human behaviour better than a fixed sleep.
human_like_delay = np.random.chisquare(3) + 5
time.sleep(human_like_delay)

# Wait up to 4 s for the search icon to become clickable, then click it.
target_button_xpath = "//i[@class='o-icon js-icon search-icon']//*[@class='icon search']"
search_icon_clickable = EC.element_to_be_clickable((By.XPATH, target_button_xpath))
target_button = WebDriverWait(driver_firefox, 4).until(search_icon_clickable)
target_button.click()

# Collecting links for cities

# The DOM is snapshotted right after a click, so give the popular-searches
# markup up to 4 s to appear first. If it is already present this returns
# immediately.
try:
    WebDriverWait(driver_firefox, 4).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "p.popular-searches__group-label"))
    )
except TimeoutException:
    # Deliberate best effort: parse whatever is on the page; popular_links
    # simply stays empty when the group is missing.
    pass

html = driver_firefox.page_source
soup = BeautifulSoup(html, "html.parser")

# Find all group labels and keep the first one for popular places
group_labels = soup.find_all("p", class_="popular-searches__group-label")
target_label = next(
    (label for label in group_labels if "Popular places to visit" in label.text),
    None,
)

# The list of places is the <ul> sibling that follows the label; it may be
# absent, in which case find_next_sibling returns None.
city_list = None
if target_label is not None:
    city_list = target_label.find_next_sibling("ul", class_="popular-searches__group-items")

popular_links = []  # (city name, absolute URL) pairs
if city_list is not None:
    # href=True skips anchors without a target instead of raising KeyError.
    for link in city_list.find_all("a", class_="popular-searches__group-item", href=True):
        city_name = link.get_text(strip=True)
        href = urljoin(website_search, link["href"])
        popular_links.append((city_name, href))


# Output of scraped cities
if popular_links:
    print("Popular Places to Visit in New Zealand:")
    for city, url in popular_links:
        print(f"{city}: {url}")
else:
    # Printing cannot realistically fail, so the failure case worth reporting
    # is an empty result: no city links were collected.
    print("Cannot retrieve data")

In [None]:
# Maps a normalized city name (e.g. "auckland") to the window handle of the
# tab in which that city's page was opened.
city_tab_handles = {}

for city, url in popular_links:
    # Open a new tab and switch to it in one step. This avoids relying on the
    # order of driver.window_handles, which Selenium does not guarantee.
    driver_firefox.switch_to.new_window("tab")

    # Load city URL
    driver_firefox.get(url)
    time.sleep(5)  # fixed pause so the page can finish loading

    # Store tab handle under a lowercase key with spaces and slashes replaced
    # by underscores and "ō" replaced by "o".
    handle_key = city.lower().replace(" ", "_").replace("/", "_").replace("ō", "o")
    city_tab_handles[handle_key] = driver_firefox.current_window_handle

## Auckland

In [None]:
# Bring the Auckland tab to the foreground and give it a moment to settle.
auckland_handle = city_tab_handles["auckland"]
driver_firefox.switch_to.window(auckland_handle)
time.sleep(5)

# Click on the "Activities" filter: wait up to 4 s for a <span> whose text
# contains "Activities" to become clickable, then click it. Any failure is
# reported instead of stopping the notebook.
activities_locator = (By.XPATH, "//span[contains(text(),'Activities')]")
try:
    activities_wait = WebDriverWait(driver_firefox, 4)
    activities_wait.until(EC.element_to_be_clickable(activities_locator)).click()
    print("Activities' filter clicked on Auckland page.")
except Exception as click_error:
    print(f"Failed to click 'Activities': {click_error}")

In [None]:
# Random, right-skewed pause (5 s + chi-square(3) draw) before interacting.
time.sleep(5 + np.random.chisquare(3))

max_clicks = 9  # upper bound on "load more" presses
load_more_xpath = '//*[@id="search-results"]/div[2]/div/div[3]/button'
load_more_clickable = EC.element_to_be_clickable((By.XPATH, load_more_xpath))

click = 0  # number of successful presses so far
for _ in range(max_clicks):
    try:
        # Wait up to 5 s for the button; a timeout is taken to mean that
        # there is nothing left to load.
        WebDriverWait(driver_firefox, 5).until(load_more_clickable).click()
    except TimeoutException:
        print("All activities loaded (no more button).")
        break

    click += 1
    print("Loading more...")

    # Fixed pause so the newly requested content can load
    time.sleep(5)


## BeautifulSoup Static Scraping

In [None]:
html = driver_firefox.page_source
soup = BeautifulSoup(html, "html.parser")

# Relative hrefs/srcs are resolved against the page they were scraped from,
# consistent with how the city links are built with urljoin. Absolute URLs
# pass through urljoin unchanged.
page_url = driver_firefox.current_url

# Step 1: Find the result container
results_container = soup.find("div", class_="search-results__results")
activity_blocks = results_container.find_all("div", class_="results__wrapper") if results_container else []

# Step 2: Prepare lists (kept index-aligned: one entry per activity block)
titles = []
links = []
descriptions = []
images = []

# Step 3: Loop through each block
for activity in activity_blocks:
    try:
        # Title and Link
        title_tag = activity.select_one("h4.results__title a")
        title = title_tag.get_text(strip=True) if title_tag else ""
        raw_href = title_tag.get("href", "") if title_tag else ""
        # Guard the empty case: urljoin(base, "") would return the base URL.
        link = urljoin(page_url, raw_href) if raw_href else ""

        # Description
        desc_tag = activity.select_one("p.results__description")
        description = desc_tag.get_text(strip=True) if desc_tag else ""

        # Image
        img_tag = activity.select_one("figure.results__photo img")
        raw_src = img_tag.get("src", "") if img_tag else ""
        img_url = urljoin(page_url, raw_src) if raw_src else ""

        # Append all four values together so the lists stay the same length
        titles.append(title)
        links.append(link)
        descriptions.append(description)
        images.append(img_url)

    except Exception as e:
        # Best effort: report the problem and move on to the next block.
        print(f"Skipping block due to: {e}")
        continue


In [None]:
# The four lists are appended to together, so their lengths should match.
for scraped_column in (titles, links, descriptions, images):
    print(len(scraped_column))

In [None]:
# Preview the first three scraped descriptions.
descriptions[:3]

In [None]:
# Preview the first three scraped links.
links[:3]

## Dataframe

In [None]:
# Build the table from the lists filled by the BeautifulSoup scraping step.
# The lists are index-aligned, so each row describes one activity.
df = pd.DataFrame({
    "Title": titles,
    "Link": links,
    "Description": descriptions,
    "Image": images,
})

In [None]:
# Display the DataFrame as the cell output.
df

In [None]:
# quit() ends the whole WebDriver session: it closes every window/tab and
# stops the driver process. close() would only close the current tab and
# leave the other tabs and the driver running.
driver_firefox.quit()

## Export to Pickle

In [None]:

##