# Popular Tourist Destinations in New Zealand
*A Web Scraping Project on Dynamic and Static Webpages*
* I Putu Agastya Harta Pratama
* Łukasz Brzoska

Faculty of Economic Sciences <br>
University of Warsaw <br>
Warsaw, Poland <br>
2025

## Imports

In [1]:
# FOR DATA PROCESSING:
import pandas as pd
import numpy as np

# FOR MEASURING COMPUTATION TIME, CREATING FIXED DELAYS:
import time

# FOR APPLYING BEAUTIFULSOUP
from bs4 import BeautifulSoup

# FOR APPLYING SELENIUM:
import selenium 
from selenium import webdriver 
from webdriver_manager.firefox import GeckoDriverManager 
from selenium.webdriver.chrome.service import Service 
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC 
from selenium.webdriver.common.by import By 
from selenium.common.exceptions import TimeoutException

# FOR SAVING DATA:
import pickle # pickle format of saved output

# FOR URL PARSING:
from urllib.parse import urljoin

In [2]:
def save_object(obj, filename):
    """Pickle ``obj`` into the file at ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. The highest pickle protocol available is used.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)
        
# Resolve the geckodriver binary through webdriver_manager and keep its path
# for the Selenium service created in the next section.
firefoxpath = GeckoDriverManager().install()
print("Driver Installed at: ", firefoxpath)

Driver Installed at:  /Users/agastyaharta/.wdm/drivers/geckodriver/mac64/v0.36.0/geckodriver


## Accessing Website

In [3]:
website = "https://www.newzealand.com/int/"

# Use the Firefox-specific service class. The bare `Service` name imported at
# the top of the notebook comes from selenium.webdriver.chrome.service and is
# meant for chromedriver, not for the geckodriver binary installed above.
service_firefox = webdriver.FirefoxService(executable_path=firefoxpath)
options_firefox = webdriver.FirefoxOptions()
driver_firefox = webdriver.Firefox(service=service_firefox, options=options_firefox)

# Maximise first so the page lays out at full width before it is loaded.
driver_firefox.maximize_window()
driver_firefox.get(website)

## Selenium Automation

In [4]:
website_search = "https://www.newzealand.com/int/"
driver_firefox.get(website_search)

start = time.time()
# Wait a random time drawn from a strongly right-skewed distribution
# (chi-square, 3 degrees of freedom, plus a 5 s floor) to better imitate
# human behaviour.
time.sleep(np.random.chisquare(3) + 5)

# Open the search overlay; the "popular searches" panel lives inside it.
target_button_xpath = "//i[@class='o-icon js-icon search-icon']//*[@class='icon search']"
target_button = WebDriverWait(driver_firefox, 4).until(
    EC.element_to_be_clickable((By.XPATH, target_button_xpath))
)
target_button.click()

# Give the panel a chance to appear before snapshotting the DOM; reading
# page_source straight after the click can miss the group labels.
try:
    WebDriverWait(driver_firefox, 4).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "p.popular-searches__group-label"))
    )
except TimeoutException:
    print("Popular-searches panel did not appear within 4 seconds.")

html = driver_firefox.page_source
soup = BeautifulSoup(html, "html.parser")

# Find all group labels and pick the one heading the list of places.
group_labels = soup.find_all("p", class_="popular-searches__group-label")
target_label = next(
    (label for label in group_labels if "Popular places to visit" in label.text),
    None,
)

popular_links = []
if target_label is not None:
    city_list = target_label.find_next_sibling("ul", class_="popular-searches__group-items")
    # The sibling <ul> may be missing; treat that as "no links" instead of
    # failing with AttributeError on None.
    anchors = city_list.find_all("a", class_="popular-searches__group-item") if city_list else []
    for link in anchors:
        href = link.get("href")
        if not href:
            continue  # an anchor without a target cannot be visited later
        city_name = link.get_text(strip=True)
        popular_links.append((city_name, urljoin(website_search, href)))

# Output of scraped cities. An empty list is the real failure mode here, so
# test for it explicitly instead of wrapping plain prints in try/except.
if popular_links:
    print("Popular Places to Visit in New Zealand:")
    for city, url in popular_links:
        print(f"{city}: {url}")
else:
    print("Cannot retrieve data")

Popular Places to Visit in New Zealand:
Auckland: https://www.newzealand.com/int/utilities/search/?q=Auckland&type=popular
Queenstown: https://www.newzealand.com/int/utilities/search/?q=Queenstown&type=popular
Lake Tekapo / Takapō: https://www.newzealand.com/int/utilities/search/?q=Lake+Tekapo+%2F+Takap%C5%8D&type=popular
Wānaka: https://www.newzealand.com/int/utilities/search/?q=W%C4%81naka&type=popular


In [5]:
city_tab_handles = {}

for city, url in popular_links:
    # Spawn a blank tab and move the driver's focus to it (the newest handle
    # is the last one in window_handles).
    driver_firefox.execute_script("window.open();")
    newest_tab = driver_firefox.window_handles[-1]
    driver_firefox.switch_to.window(newest_tab)

    # Load the city's search page and give it a fixed five seconds to settle.
    driver_firefox.get(url)
    time.sleep(5)

    # Build the dictionary key: lower-case, spaces and slashes become
    # underscores, and "ō" is transliterated. Other macrons (e.g. the "ā" in
    # "Wānaka") are left untouched, and later cells look the key up that way.
    handle_key = city.lower()
    for old, new in ((" ", "_"), ("/", "_"), ("ō", "o")):
        handle_key = handle_key.replace(old, new)
    city_tab_handles[handle_key] = driver_firefox.current_window_handle

In [39]:
# Display the mapping from normalised city key to browser tab handle.
city_tab_handles

{'auckland': 'd5c0ebca-863f-4016-9509-c4cb3f51ab84',
 'queenstown': 'bf91451e-26fb-4dfc-b494-935806cd8d6f',
 'lake_tekapo___takapo': '3f27098f-e482-41bb-b51b-fca4a04bb772',
 'wānaka': 'af0442c7-d08c-4281-8964-2230d41e6e49'}

## Auckland

In [6]:
# Bring the Auckland results tab to the foreground and let it settle.
driver_firefox.switch_to.window(city_tab_handles["auckland"])
time.sleep(5)

# Narrow the search results down to the "Activities" category.
filter_xpath = "//span[contains(text(),'Activities')]"
try:
    activities_clickable = EC.element_to_be_clickable((By.XPATH, filter_xpath))
    filter_button = WebDriverWait(driver_firefox, 4).until(activities_clickable)
    filter_button.click()
    print("Activities' filter clicked on Auckland page.")
except Exception as e:
    # Best effort: report the problem and let the notebook continue.
    print(f"Failed to click 'Activities': {e}")

Activities' filter clicked on Auckland page.


In [7]:
# Human-like pause before paging through the results.
time.sleep(np.random.chisquare(3) + 5)

click = 0
max_clicks = 4
load_more_xpath = '//*[@id="search-results"]/div[2]/div/div[3]/button'
button_ready = EC.element_to_be_clickable((By.XPATH, load_more_xpath))

# Press "load more" up to max_clicks times; stop early once the button no
# longer becomes clickable within five seconds.
while click < max_clicks:
    try:
        load_more_button = WebDriverWait(driver_firefox, 5).until(button_ready)
        load_more_button.click()
        click += 1
        print("Loading more...")
        # Fixed pause so the freshly requested results can render.
        time.sleep(5)
    except TimeoutException:
        print("All activities loaded (no more button).")
        break


Loading more...
Loading more...
Loading more...
Loading more...


### Data Scraping - Auckland

In [8]:
# Snapshot the fully expanded Auckland results page and parse it offline.
html = driver_firefox.page_source
soup = BeautifulSoup(html, "html.parser")

results_container = soup.find("div", class_="search-results__results")
if results_container:
    activity_blocks = results_container.find_all("div", class_="results__wrapper")
else:
    activity_blocks = []

titles_auckland, links_auckland = [], []
descriptions_auckland, images_auckland = [], []

for activity in activity_blocks:
    try:
        title_path = activity.select_one("h4.results__title a")
        desc_path = activity.select_one("p.results__description")
        img_path = activity.select_one("figure.results__photo img")

        # Title and link both come from the heading anchor.
        title, link = "", ""
        if title_path:
            title = title_path.get_text(strip=True)
            link = title_path.attrs.get("href", "")

        # Missing elements/attributes degrade to empty strings.
        description = desc_path.get_text(strip=True) if desc_path else ""
        img_url = img_path.attrs.get("src", "") if img_path else ""
    except Exception as e:
        print(f"Skipping block due to: {e}")
        continue

    # Append together so the four lists always stay index-aligned.
    titles_auckland.append(title)
    links_auckland.append(link)
    descriptions_auckland.append(description)
    images_auckland.append(img_url)

In [11]:
street_addresses_auckland = []
localities_auckland = []
emails_auckland = []
phone_numbers_auckland = []

# CSS selector of the schema.org address paragraph on an activity detail page.
address_css = "p[itemtype='http://schema.org/LocalBusiness']"

for idx, url in enumerate(links_auckland):
    # Start every page from blank fields. Previously only street/locality were
    # reset on failure, so a failed page silently inherited the e-mail and
    # phone number of the preceding activity (or raised NameError when the
    # very first page failed).
    street_text = locality_text = email = phone_number = ""

    try:
        driver_firefox.get(url)
        WebDriverWait(driver_firefox, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, address_css))
        )

        detail_soup = BeautifulSoup(driver_firefox.page_source, "html.parser")
        address_block = detail_soup.select_one(address_css)

        # Guard against the parsed snapshot lacking the block.
        if address_block is not None:
            # Street
            street_path = address_block.select_one("span[itemprop='streetAddress']")
            street_text = street_path.get_text(strip=True) if street_path else ""

            # Locality
            locality_path = address_block.select_one("span[itemprop='addressLocality']")
            locality_text = locality_path.get_text(strip=True) if locality_path else ""

        # Phone (first matching link; a missing href yields an empty string)
        phone_path = driver_firefox.find_elements(By.CSS_SELECTOR, "a.js-phone-link")
        if phone_path:
            phone_number = (phone_path[0].get_attribute("href") or "").replace("tel:", "").strip()

        # Email (first mailto: link on the page)
        email_tag = driver_firefox.find_elements(By.CSS_SELECTOR, "a[href^='mailto:']")
        if email_tag:
            email = (email_tag[0].get_attribute("href") or "").replace("mailto:", "").strip()

    except Exception as e:
        # Best effort: keep whatever was extracted before the failure.
        print(f"{idx+1}. Failed to extract data for: {url} — {e}")

    # One entry per link, success or not, so the lists stay index-aligned
    # with links_auckland.
    street_addresses_auckland.append(street_text)
    localities_auckland.append(locality_text)
    emails_auckland.append(email)
    phone_numbers_auckland.append(phone_number)

    # Random, right-skewed pause between detail pages.
    wait_time = np.random.chisquare(3) + 1
    print(f"Sleeping for {wait_time:.2f} seconds...")
    time.sleep(wait_time)

Sleeping for 4.69 seconds...
Sleeping for 8.30 seconds...
Sleeping for 4.17 seconds...
Sleeping for 1.99 seconds...
Sleeping for 1.22 seconds...
Sleeping for 3.12 seconds...
Sleeping for 7.97 seconds...
Sleeping for 4.51 seconds...
Sleeping for 1.49 seconds...
Sleeping for 2.41 seconds...
Sleeping for 8.45 seconds...
Sleeping for 9.90 seconds...
Sleeping for 1.53 seconds...
Sleeping for 1.82 seconds...
Sleeping for 1.88 seconds...
Sleeping for 3.49 seconds...
Sleeping for 2.08 seconds...
Sleeping for 9.47 seconds...
Sleeping for 6.24 seconds...
Sleeping for 4.34 seconds...
Sleeping for 1.48 seconds...
Sleeping for 10.18 seconds...
Sleeping for 2.18 seconds...
Sleeping for 3.68 seconds...
Sleeping for 5.82 seconds...
Sleeping for 2.71 seconds...
Sleeping for 7.35 seconds...
Sleeping for 7.99 seconds...
Sleeping for 1.10 seconds...
Sleeping for 2.82 seconds...
Sleeping for 1.78 seconds...
Sleeping for 1.15 seconds...
Sleeping for 4.79 seconds...
Sleeping for 5.25 seconds...
Sleeping for 

### Final Check of Auckland Scraped Lists

In [17]:
# Pair every Auckland list with its display name once, then derive the two
# parallel lists from that single mapping (dicts keep insertion order).
_auckland_named_lists = {
    "titles_auckland": titles_auckland,
    "links_auckland": links_auckland,
    "descriptions_auckland": descriptions_auckland,
    "images_auckland": images_auckland,
    "street_addresses_auckland": street_addresses_auckland,
    "localities_auckland": localities_auckland,
    "emails_auckland": emails_auckland,
    "phone_numbers_auckland": phone_numbers_auckland,
}

auckland_scrapped_lists = list(_auckland_named_lists.values())
list_names = list(_auckland_named_lists)

# All lengths must match before the lists can become DataFrame columns.
for label, scraped in zip(list_names, auckland_scrapped_lists):
    print(f"List length of {label}: {len(scraped)}")

List length of titles_auckland: 50
List length of links_auckland: 50
List length of descriptions_auckland: 50
List length of images_auckland: 50
List length of street_addresses_auckland: 50
List length of localities_auckland: 50
List length of emails_auckland: 50
List length of phone_numbers_auckland: 50


## Queenstown

In [20]:
# Bring the Queenstown results tab to the foreground and let it settle.
driver_firefox.switch_to.window(city_tab_handles["queenstown"])
time.sleep(5)

# Narrow the search results down to the "Activities" category.
filter_xpath = "//span[contains(text(),'Activities')]"
try:
    activities_clickable = EC.element_to_be_clickable((By.XPATH, filter_xpath))
    filter_button = WebDriverWait(driver_firefox, 4).until(activities_clickable)
    filter_button.click()
    print("Activities' filter clicked on Queenstown page.")
except Exception as e:
    # Best effort: report the problem and let the notebook continue.
    print(f"Failed to click 'Activities': {e}")

Activities' filter clicked on Queenstown page.


In [21]:
# Human-like pause before paging through the results.
time.sleep(np.random.chisquare(3) + 5)

click = 0
max_clicks = 4
load_more_xpath = '//*[@id="search-results"]/div[2]/div/div[3]/button'
button_ready = EC.element_to_be_clickable((By.XPATH, load_more_xpath))

# Press "load more" up to max_clicks times; stop early once the button no
# longer becomes clickable within five seconds.
while click < max_clicks:
    try:
        load_more_button = WebDriverWait(driver_firefox, 5).until(button_ready)
        load_more_button.click()
        click += 1
        print("Loading more...")
        # Fixed pause so the freshly requested results can render.
        time.sleep(5)
    except TimeoutException:
        print("All activities loaded (no more button).")
        break

Loading more...
Loading more...
Loading more...
Loading more...


### Data Scraping - Queenstown

In [22]:
# Snapshot the fully expanded Queenstown results page and parse it offline.
html = driver_firefox.page_source
soup = BeautifulSoup(html, "html.parser")

results_container = soup.find("div", class_="search-results__results")
if results_container:
    activity_blocks = results_container.find_all("div", class_="results__wrapper")
else:
    activity_blocks = []

titles_queenstown, links_queenstown = [], []
descriptions_queenstown, images_queenstown = [], []

for activity in activity_blocks:
    try:
        title_path = activity.select_one("h4.results__title a")
        desc_path = activity.select_one("p.results__description")
        img_path = activity.select_one("figure.results__photo img")

        # Title and link both come from the heading anchor.
        title, link = "", ""
        if title_path:
            title = title_path.get_text(strip=True)
            link = title_path.attrs.get("href", "")

        # Missing elements/attributes degrade to empty strings.
        description = desc_path.get_text(strip=True) if desc_path else ""
        img_url = img_path.attrs.get("src", "") if img_path else ""
    except Exception as e:
        print(f"Skipping block due to: {e}")
        continue

    # Append together so the four lists always stay index-aligned.
    titles_queenstown.append(title)
    links_queenstown.append(link)
    descriptions_queenstown.append(description)
    images_queenstown.append(img_url)

In [26]:
street_addresses_queenstown = []
localities_queenstown = []
emails_queenstown = []
phone_numbers_queenstown = []

# CSS selector of the schema.org address paragraph on an activity detail page.
address_css = "p[itemtype='http://schema.org/LocalBusiness']"

for idx, url in enumerate(links_queenstown):
    # Start every page from blank fields. Previously only street/locality were
    # reset on failure, so a failed page silently inherited the e-mail and
    # phone number of the preceding activity (or raised NameError when the
    # very first page failed).
    street_text = locality_text = email = phone_number = ""

    try:
        driver_firefox.get(url)
        WebDriverWait(driver_firefox, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, address_css))
        )

        detail_soup = BeautifulSoup(driver_firefox.page_source, "html.parser")
        address_block = detail_soup.select_one(address_css)

        # Guard against the parsed snapshot lacking the block.
        if address_block is not None:
            # Street
            street_path = address_block.select_one("span[itemprop='streetAddress']")
            street_text = street_path.get_text(strip=True) if street_path else ""

            # Locality
            locality_path = address_block.select_one("span[itemprop='addressLocality']")
            locality_text = locality_path.get_text(strip=True) if locality_path else ""

        # Phone (first matching link; a missing href yields an empty string)
        phone_path = driver_firefox.find_elements(By.CSS_SELECTOR, "a.js-phone-link")
        if phone_path:
            phone_number = (phone_path[0].get_attribute("href") or "").replace("tel:", "").strip()

        # Email (first mailto: link on the page)
        email_tag = driver_firefox.find_elements(By.CSS_SELECTOR, "a[href^='mailto:']")
        if email_tag:
            email = (email_tag[0].get_attribute("href") or "").replace("mailto:", "").strip()

    except Exception as e:
        # Best effort: keep whatever was extracted before the failure.
        print(f"{idx+1}. Failed to extract data for: {url} — {e}")

    # One entry per link, success or not, so the lists stay index-aligned
    # with links_queenstown.
    street_addresses_queenstown.append(street_text)
    localities_queenstown.append(locality_text)
    emails_queenstown.append(email)
    phone_numbers_queenstown.append(phone_number)

    # Random, right-skewed pause between detail pages.
    wait_time = np.random.chisquare(3) + 2
    print(f"Sleeping for {wait_time:.2f} seconds...")
    time.sleep(wait_time)

Sleeping for 6.15 seconds...
Sleeping for 2.32 seconds...
Sleeping for 6.07 seconds...
Sleeping for 5.42 seconds...
Sleeping for 4.00 seconds...
Sleeping for 2.30 seconds...
Sleeping for 3.48 seconds...
Sleeping for 9.79 seconds...
Sleeping for 4.30 seconds...
Sleeping for 9.05 seconds...
Sleeping for 6.61 seconds...
Sleeping for 5.49 seconds...
Sleeping for 2.42 seconds...
Sleeping for 5.29 seconds...
Sleeping for 2.87 seconds...
Sleeping for 7.15 seconds...
Sleeping for 2.37 seconds...
Sleeping for 2.77 seconds...
Sleeping for 4.56 seconds...
Sleeping for 5.26 seconds...
Sleeping for 2.57 seconds...
Sleeping for 2.22 seconds...
Sleeping for 3.19 seconds...
Sleeping for 4.97 seconds...
Sleeping for 4.95 seconds...
Sleeping for 3.51 seconds...
Sleeping for 3.26 seconds...
Sleeping for 4.52 seconds...
Sleeping for 5.99 seconds...
Sleeping for 5.85 seconds...
Sleeping for 4.93 seconds...
Sleeping for 3.42 seconds...
Sleeping for 3.08 seconds...
Sleeping for 4.95 seconds...
Sleeping for 4

### Final Check of Queenstown Scraped Lists

In [31]:
# Pair every Queenstown list with its display name once, then derive the two
# parallel lists from that single mapping (dicts keep insertion order).
_queenstown_named_lists = {
    "titles_queenstown": titles_queenstown,
    "links_queenstown": links_queenstown,
    "descriptions_queenstown": descriptions_queenstown,
    "images_queenstown": images_queenstown,
    "street_addresses_queenstown": street_addresses_queenstown,
    "localities_queenstown": localities_queenstown,
    "emails_queenstown": emails_queenstown,
    "phone_numbers_queenstown": phone_numbers_queenstown,
}

queenstown_scrapped_lists = list(_queenstown_named_lists.values())
list_names = list(_queenstown_named_lists)

# All lengths must match before the lists can become DataFrame columns.
for label, scraped in zip(list_names, queenstown_scrapped_lists):
    print(f"List length of {label}: {len(scraped)}")

List length of titles_queenstown: 50
List length of links_queenstown: 50
List length of descriptions_queenstown: 50
List length of images_queenstown: 50
List length of street_addresses_queenstown: 50
List length of localities_queenstown: 50
List length of emails_queenstown: 50
List length of phone_numbers_queenstown: 50


## Lake Tekapo

In [43]:
# Bring the Lake Tekapo results tab to the foreground and let it settle.
driver_firefox.switch_to.window(city_tab_handles["lake_tekapo___takapo"])
time.sleep(5)

# Narrow the search results down to the "Activities" category.
filter_xpath = "//span[contains(text(),'Activities')]"
try:
    activities_clickable = EC.element_to_be_clickable((By.XPATH, filter_xpath))
    filter_button = WebDriverWait(driver_firefox, 4).until(activities_clickable)
    filter_button.click()
    print("Activities' filter clicked on Lake Tekapo / Takapō page.")
except Exception as e:
    # Best effort: report the problem and let the notebook continue.
    print(f"Failed to click 'Activities': {e}")

Activities' filter clicked on Lake Tekapo / Takapō page.


In [44]:
# Human-like pause before paging through the results.
time.sleep(np.random.chisquare(3) + 5)

click = 0
max_clicks = 4
load_more_xpath = '//*[@id="search-results"]/div[2]/div/div[3]/button'
button_ready = EC.element_to_be_clickable((By.XPATH, load_more_xpath))

# Press "load more" up to max_clicks times; stop early once the button no
# longer becomes clickable within five seconds.
while click < max_clicks:
    try:
        load_more_button = WebDriverWait(driver_firefox, 5).until(button_ready)
        load_more_button.click()
        click += 1
        print("Loading more...")
        # Fixed pause so the freshly requested results can render.
        time.sleep(5)
    except TimeoutException:
        print("All activities loaded (no more button).")
        break

Loading more...
Loading more...
Loading more...
Loading more...


### Data Scraping - Lake Tekapo

In [45]:
# Snapshot the fully expanded Lake Tekapo results page and parse it offline.
html = driver_firefox.page_source
soup = BeautifulSoup(html, "html.parser")

results_container = soup.find("div", class_="search-results__results")
if results_container:
    activity_blocks = results_container.find_all("div", class_="results__wrapper")
else:
    activity_blocks = []

titles_tekapo, links_tekapo = [], []
descriptions_tekapo, images_tekapo = [], []

for activity in activity_blocks:
    try:
        title_path = activity.select_one("h4.results__title a")
        desc_path = activity.select_one("p.results__description")
        img_path = activity.select_one("figure.results__photo img")

        # Title and link both come from the heading anchor.
        title, link = "", ""
        if title_path:
            title = title_path.get_text(strip=True)
            link = title_path.attrs.get("href", "")

        # Missing elements/attributes degrade to empty strings.
        description = desc_path.get_text(strip=True) if desc_path else ""
        img_url = img_path.attrs.get("src", "") if img_path else ""
    except Exception as e:
        print(f"Skipping block due to: {e}")
        continue

    # Append together so the four lists always stay index-aligned.
    titles_tekapo.append(title)
    links_tekapo.append(link)
    descriptions_tekapo.append(description)
    images_tekapo.append(img_url)

In [46]:
street_addresses_tekapo = []
localities_tekapo = []
emails_tekapo = []
phone_numbers_tekapo = []

# CSS selector of the schema.org address paragraph on an activity detail page.
address_css = "p[itemtype='http://schema.org/LocalBusiness']"

for idx, url in enumerate(links_tekapo):
    # Start every page from blank fields. Previously only street/locality were
    # reset on failure, so a failed page silently inherited the e-mail and
    # phone number of the preceding activity (or raised NameError when the
    # very first page failed).
    street_text = locality_text = email = phone_number = ""

    try:
        driver_firefox.get(url)
        WebDriverWait(driver_firefox, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, address_css))
        )

        detail_soup = BeautifulSoup(driver_firefox.page_source, "html.parser")
        address_block = detail_soup.select_one(address_css)

        # Guard against the parsed snapshot lacking the block.
        if address_block is not None:
            # Street
            street_path = address_block.select_one("span[itemprop='streetAddress']")
            street_text = street_path.get_text(strip=True) if street_path else ""

            # Locality
            locality_path = address_block.select_one("span[itemprop='addressLocality']")
            locality_text = locality_path.get_text(strip=True) if locality_path else ""

        # Phone (first matching link; a missing href yields an empty string)
        phone_path = driver_firefox.find_elements(By.CSS_SELECTOR, "a.js-phone-link")
        if phone_path:
            phone_number = (phone_path[0].get_attribute("href") or "").replace("tel:", "").strip()

        # Email (first mailto: link on the page)
        email_tag = driver_firefox.find_elements(By.CSS_SELECTOR, "a[href^='mailto:']")
        if email_tag:
            email = (email_tag[0].get_attribute("href") or "").replace("mailto:", "").strip()

    except Exception as e:
        # Best effort: keep whatever was extracted before the failure.
        print(f"{idx+1}. Failed to extract data for: {url} — {e}")

    # One entry per link, success or not, so the lists stay index-aligned
    # with links_tekapo.
    street_addresses_tekapo.append(street_text)
    localities_tekapo.append(locality_text)
    emails_tekapo.append(email)
    phone_numbers_tekapo.append(phone_number)

    # Random, right-skewed pause between detail pages.
    wait_time = np.random.chisquare(3) + 2
    print(f"Sleeping for {wait_time:.2f} seconds...")
    time.sleep(wait_time)

Sleeping for 3.10 seconds...
Sleeping for 2.94 seconds...
Sleeping for 3.75 seconds...
Sleeping for 5.87 seconds...
Sleeping for 11.91 seconds...
Sleeping for 4.71 seconds...
Sleeping for 3.22 seconds...
Sleeping for 6.76 seconds...
Sleeping for 6.75 seconds...
Sleeping for 2.98 seconds...
Sleeping for 9.79 seconds...
Sleeping for 5.01 seconds...
Sleeping for 6.40 seconds...
Sleeping for 6.36 seconds...
Sleeping for 3.28 seconds...
Sleeping for 4.81 seconds...
Sleeping for 4.83 seconds...
Sleeping for 6.86 seconds...
Sleeping for 5.96 seconds...
Sleeping for 6.41 seconds...
Sleeping for 6.59 seconds...
Sleeping for 3.58 seconds...
Sleeping for 2.44 seconds...
Sleeping for 5.58 seconds...
Sleeping for 3.55 seconds...
Sleeping for 2.08 seconds...
Sleeping for 5.19 seconds...
Sleeping for 4.46 seconds...
Sleeping for 6.11 seconds...
Sleeping for 4.52 seconds...
Sleeping for 4.36 seconds...
Sleeping for 3.29 seconds...
Sleeping for 4.53 seconds...
Sleeping for 4.46 seconds...
Sleeping for 

### Final Check of Lake Tekapo Scraped Lists

In [47]:
# Pair every Lake Tekapo list with its display name once, then derive the two
# parallel lists from that single mapping (dicts keep insertion order).
_tekapo_named_lists = {
    "titles_tekapo": titles_tekapo,
    "links_tekapo": links_tekapo,
    "descriptions_tekapo": descriptions_tekapo,
    "images_tekapo": images_tekapo,
    "street_addresses_tekapo": street_addresses_tekapo,
    "localities_tekapo": localities_tekapo,
    "emails_tekapo": emails_tekapo,
    "phone_numbers_tekapo": phone_numbers_tekapo,
}

tekapo_scrapped_lists = list(_tekapo_named_lists.values())
list_names = list(_tekapo_named_lists)

# All lengths must match before the lists can become DataFrame columns.
for label, scraped in zip(list_names, tekapo_scrapped_lists):
    print(f"List length of {label}: {len(scraped)}")

List length of titles_tekapo: 50
List length of links_tekapo: 50
List length of descriptions_tekapo: 50
List length of images_tekapo: 50
List length of street_addresses_tekapo: 50
List length of localities_tekapo: 50
List length of emails_tekapo: 50
List length of phone_numbers_tekapo: 50


## Wanaka

In [52]:
# Bring the Wānaka results tab to the foreground and let it settle. The key
# keeps its macron because the tab-opening cell only transliterates "ō".
driver_firefox.switch_to.window(city_tab_handles["wānaka"])
time.sleep(5)

# Narrow the search results down to the "Activities" category.
filter_xpath = "//span[contains(text(),'Activities')]"
try:
    activities_clickable = EC.element_to_be_clickable((By.XPATH, filter_xpath))
    filter_button = WebDriverWait(driver_firefox, 4).until(activities_clickable)
    filter_button.click()
    print("Activities' filter clicked on Wanaka page.")
except Exception as e:
    # Best effort: report the problem and let the notebook continue.
    print(f"Failed to click 'Activities': {e}")

Activities' filter clicked on Wanaka page.


In [53]:
# Human-like pause before paging through the results.
time.sleep(np.random.chisquare(3) + 5)

click = 0
max_clicks = 4
load_more_xpath = '//*[@id="search-results"]/div[2]/div/div[3]/button'
button_ready = EC.element_to_be_clickable((By.XPATH, load_more_xpath))

# Press "load more" up to max_clicks times; stop early once the button no
# longer becomes clickable within five seconds.
while click < max_clicks:
    try:
        load_more_button = WebDriverWait(driver_firefox, 5).until(button_ready)
        load_more_button.click()
        click += 1
        print("Loading more...")
        # Fixed pause so the freshly requested results can render.
        time.sleep(5)
    except TimeoutException:
        print("All activities loaded (no more button).")
        break

Loading more...
Loading more...
Loading more...
Loading more...


### Data Scraping - Wanaka

In [54]:
# Snapshot the fully expanded Wānaka results page and parse it offline.
html = driver_firefox.page_source
soup = BeautifulSoup(html, "html.parser")

results_container = soup.find("div", class_="search-results__results")
if results_container:
    activity_blocks = results_container.find_all("div", class_="results__wrapper")
else:
    activity_blocks = []

titles_wanaka, links_wanaka = [], []
descriptions_wanaka, images_wanaka = [], []

for activity in activity_blocks:
    try:
        title_path = activity.select_one("h4.results__title a")
        desc_path = activity.select_one("p.results__description")
        img_path = activity.select_one("figure.results__photo img")

        # Title and link both come from the heading anchor.
        title, link = "", ""
        if title_path:
            title = title_path.get_text(strip=True)
            link = title_path.attrs.get("href", "")

        # Missing elements/attributes degrade to empty strings.
        description = desc_path.get_text(strip=True) if desc_path else ""
        img_url = img_path.attrs.get("src", "") if img_path else ""
    except Exception as e:
        print(f"Skipping block due to: {e}")
        continue

    # Append together so the four lists always stay index-aligned.
    titles_wanaka.append(title)
    links_wanaka.append(link)
    descriptions_wanaka.append(description)
    images_wanaka.append(img_url)

In [56]:
street_addresses_wanaka = []
localities_wanaka = []
emails_wanaka = []
phone_numbers_wanaka = []

# CSS selector of the schema.org address paragraph on an activity detail page.
address_css = "p[itemtype='http://schema.org/LocalBusiness']"

for idx, url in enumerate(links_wanaka):
    # Start every page from blank fields. Previously only street/locality were
    # reset on failure, so a failed page silently inherited the e-mail and
    # phone number of the preceding activity (or raised NameError when the
    # very first page failed).
    street_text = locality_text = email = phone_number = ""

    try:
        driver_firefox.get(url)
        WebDriverWait(driver_firefox, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, address_css))
        )

        detail_soup = BeautifulSoup(driver_firefox.page_source, "html.parser")
        address_block = detail_soup.select_one(address_css)

        # Guard against the parsed snapshot lacking the block.
        if address_block is not None:
            # Street
            street_path = address_block.select_one("span[itemprop='streetAddress']")
            street_text = street_path.get_text(strip=True) if street_path else ""

            # Locality
            locality_path = address_block.select_one("span[itemprop='addressLocality']")
            locality_text = locality_path.get_text(strip=True) if locality_path else ""

        # Phone (first matching link; a missing href yields an empty string)
        phone_path = driver_firefox.find_elements(By.CSS_SELECTOR, "a.js-phone-link")
        if phone_path:
            phone_number = (phone_path[0].get_attribute("href") or "").replace("tel:", "").strip()

        # Email (first mailto: link on the page)
        email_tag = driver_firefox.find_elements(By.CSS_SELECTOR, "a[href^='mailto:']")
        if email_tag:
            email = (email_tag[0].get_attribute("href") or "").replace("mailto:", "").strip()

    except Exception as e:
        # Best effort: keep whatever was extracted before the failure.
        print(f"{idx+1}. Failed to extract data for: {url} — {e}")

    # One entry per link, success or not, so the lists stay index-aligned
    # with links_wanaka.
    street_addresses_wanaka.append(street_text)
    localities_wanaka.append(locality_text)
    emails_wanaka.append(email)
    phone_numbers_wanaka.append(phone_number)

    # Random, right-skewed pause between detail pages.
    wait_time = np.random.chisquare(3) + 2
    print(f"Sleeping for {wait_time:.2f} seconds...")
    time.sleep(wait_time)

Sleeping for 3.15 seconds...
Sleeping for 11.47 seconds...
Sleeping for 12.61 seconds...
Sleeping for 8.65 seconds...
Sleeping for 3.16 seconds...
Sleeping for 5.91 seconds...
Sleeping for 8.15 seconds...
Sleeping for 4.58 seconds...
Sleeping for 2.48 seconds...
Sleeping for 3.06 seconds...
Sleeping for 4.72 seconds...
Sleeping for 3.08 seconds...
Sleeping for 8.23 seconds...
Sleeping for 4.57 seconds...
Sleeping for 8.22 seconds...
Sleeping for 4.31 seconds...
Sleeping for 4.11 seconds...
Sleeping for 5.80 seconds...
Sleeping for 3.57 seconds...
Sleeping for 4.09 seconds...
Sleeping for 4.08 seconds...
Sleeping for 3.24 seconds...
Sleeping for 5.51 seconds...
Sleeping for 5.75 seconds...
Sleeping for 2.48 seconds...
Sleeping for 5.44 seconds...
Sleeping for 3.19 seconds...
Sleeping for 6.39 seconds...
Sleeping for 3.00 seconds...
Sleeping for 6.23 seconds...
Sleeping for 3.97 seconds...
Sleeping for 4.63 seconds...
Sleeping for 5.97 seconds...
Sleeping for 5.50 seconds...
Sleeping for

### Final Check of Wanaka Scraped Lists

In [57]:
# Pair every Wānaka list with its display name once, then derive the two
# parallel lists from that single mapping (dicts keep insertion order).
_wanaka_named_lists = {
    "titles_wanaka": titles_wanaka,
    "links_wanaka": links_wanaka,
    "descriptions_wanaka": descriptions_wanaka,
    "images_wanaka": images_wanaka,
    "street_addresses_wanaka": street_addresses_wanaka,
    "localities_wanaka": localities_wanaka,
    "emails_wanaka": emails_wanaka,
    "phone_numbers_wanaka": phone_numbers_wanaka,
}

wanaka_scrapped_lists = list(_wanaka_named_lists.values())
list_names = list(_wanaka_named_lists)

# All lengths must match before the lists can become DataFrame columns.
for label, scraped in zip(list_names, wanaka_scrapped_lists):
    print(f"List length of {label}: {len(scraped)}")

List length of titles_wanaka: 50
List length of links_wanaka: 50
List length of descriptions_wanaka: 50
List length of images_wanaka: 50
List length of street_addresses_wanaka: 50
List length of localities_wanaka: 50
List length of emails_wanaka: 50
List length of phone_numbers_wanaka: 50


## Convert to Dataframe

In [58]:

def _city_frame(place, titles, descriptions, streets, localities, emails, phones, links, images):
    """Assemble one city's scraped lists into a DataFrame.

    Every row is labelled with ``place``; the remaining arguments become the
    columns, in the fixed order used for all cities.
    """
    return pd.DataFrame({
        "place": [place] * len(titles),
        "activities": titles,
        "activity_descriptions": descriptions,
        "activity_address_streets": streets,
        "activity_localities": localities,
        "activity_emails": emails,
        "activity_phone_numbers": phones,
        "activity_links": links,
        "activity_images": images,
    })


data_auckland = _city_frame(
    "Auckland", titles_auckland, descriptions_auckland,
    street_addresses_auckland, localities_auckland,
    emails_auckland, phone_numbers_auckland,
    links_auckland, images_auckland,
)

data_queenstown = _city_frame(
    "Queenstown", titles_queenstown, descriptions_queenstown,
    street_addresses_queenstown, localities_queenstown,
    emails_queenstown, phone_numbers_queenstown,
    links_queenstown, images_queenstown,
)

data_tekapo = _city_frame(
    "Tekapo", titles_tekapo, descriptions_tekapo,
    street_addresses_tekapo, localities_tekapo,
    emails_tekapo, phone_numbers_tekapo,
    links_tekapo, images_tekapo,
)

data_wanaka = _city_frame(
    "Wanaka", titles_wanaka, descriptions_wanaka,
    street_addresses_wanaka, localities_wanaka,
    emails_wanaka, phone_numbers_wanaka,
    links_wanaka, images_wanaka,
)

In [61]:
# Preview the last three Auckland rows as a sanity check on the assembled frame.
data_auckland.tail(3)

Unnamed: 0,place,activities,activity_descriptions,activity_address_streets,activity_localities,activity_emails,activity_phone_numbers,activity_links,activity_images
47,Auckland,Zahn - Auckland Wedding Photographer,"I photograph honest moments, beautiful places,...",139 Victoria Street West,Auckland Central,zahn@Zahn.co.nz,+64 21 156 3386,https://www.newzealand.com/int/plan/business/z...,https://www.newzealand.com/assets/externally-m...
48,Auckland,Auckland Whale & Dolphin Safari,Experience the diverse wildlife and stunning s...,"Berth 9, Eastern Viaduct Marina",Auckland Central,bookings@awads.co.nz,+64 9 357 6032,https://www.newzealand.com/int/plan/business/a...,https://www.newzealand.com/assets/externally-m...
49,Auckland,Sky Tower - SkyCity Auckland,The iconic 328 metre Sky Tower has stood tall ...,Cnr Victoria & Federal Streets,Auckland Central,enquiries@skycity.co.nz,+64 9 363 6000,https://www.newzealand.com/int/plan/business/s...,https://www.newzealand.com/assets/externally-m...


In [62]:
# Sanity check: show the last three Queenstown rows.
data_queenstown.iloc[-3:]

Unnamed: 0,place,activities,activity_descriptions,activity_address_streets,activity_localities,activity_emails,activity_phone_numbers,activity_links,activity_images
47,Queenstown,Biking Queenstown Trail - No Boundary,Queenstown Trail is a spectacular trail networ...,30 Lookout Drive,Central Queenstown,hello@noboundary.co.nz,+64 21 245 4225,https://www.newzealand.com/int/plan/business/b...,https://www.newzealand.com/assets/Tourism-NZ/O...
48,Queenstown,Queenstown Local Sights Tour,Most of us when we travel want to do a local t...,11 Criterion Street,Central Queenstown,info@privatediscoverytours.co.nz,+64 27 472 7972,https://www.newzealand.com/int/plan/business/q...,https://www.newzealand.com/assets/externally-m...
49,Queenstown,Queenstown Paraflights - Triple Flight,Join us for the ultimate bird’s eye view over ...,"Main Town Pair, 1 Marine Parade",Central Queenstown,info@paraflights.co.nz,64 3 441 2242,https://www.newzealand.com/int/plan/business/q...,https://www.newzealand.com/assets/externally-m...


In [63]:
# Sanity check: show the last three Tekapo rows.
data_tekapo.iloc[-3:]

Unnamed: 0,place,activities,activity_descriptions,activity_address_streets,activity_localities,activity_emails,activity_phone_numbers,activity_links,activity_images
47,Tekapo,Chameleon Stargazing - Shared Tour at our Loca...,Quality and budget Tekapo stargazing tours. Le...,2 Rapuwai Lane,Lake Tekapo/Takapō,info@chameleonstargazing.com,+64 27 365 7091,https://www.newzealand.com/int/plan/business/s...,https://www.newzealand.com/assets/externally-m...
48,Tekapo,Tekapo Adventures,Tekapo Adventures offers visitors the opportun...,"8 Rapuwai Lane,",Lake Tekapo/Takapō,info@tekapoadventures.com,+64 20 4156 4900,https://www.newzealand.com/int/plan/business/t...,https://www.newzealand.com/assets/externally-m...
49,Tekapo,Chameleon Stargazing - Private Tour at our Loc...,"Get away from the crowds, plan your romantic e...",2 Rapuwai Lane,Lake Tekapo/Takapō,info@chameleonstargazing.com,+64 27 365 7091,https://www.newzealand.com/int/plan/business/p...,https://www.newzealand.com/assets/externally-m...


In [64]:
# Sanity check: show the last three Wanaka rows.
data_wanaka.iloc[-3:]

Unnamed: 0,place,activities,activity_descriptions,activity_address_streets,activity_localities,activity_emails,activity_phone_numbers,activity_links,activity_images
47,Wanaka,Wanaka Water Taxi Mou Waho Island Tour,Come and join us on a trip to our favourite pl...,"Wanaka Marina, Lakeside Road",Wānaka Town,info@wanakawatertaxi.co.nz,+64 21 1520 689,https://www.newzealand.com/int/plan/business/w...,https://www.newzealand.com/assets/externally-m...
48,Wanaka,Private 1 Day Wanaka Photography Tour,Wanaka is one of the most photographed destina...,Wanaka,Wānaka Town,info@photographyworkshops.co.nz,+64 27 261 4417,https://www.newzealand.com/int/plan/business/-...,https://www.newzealand.com/assets/externally-m...
49,Wanaka,Lakeland Jet Boat | Lake Wanaka & Clutha River,Experience one of New Zealand’s most iconic ad...,100 Ardmore Street,Wānaka Town,contact@lakelandwanaka.com,+64 3 443 7495,https://www.newzealand.com/int/plan/business/c...,https://www.newzealand.com/assets/externally-m...


In [None]:
# Collect the four per-city tables in a fixed order (Auckland, Queenstown,
# Tekapo, Wanaka) so the combined frame keeps the cities grouped that way.
data_all_city = [data_auckland, data_queenstown, data_tekapo, data_wanaka]

# Stack them row-wise; ignore_index=True discards each table's own row labels
# and renumbers the result with a fresh 0-based index.
final_data = pd.concat(data_all_city, ignore_index=True)

In [68]:
# Display the combined table of all four cities as this cell's output.
final_data

Unnamed: 0,place,activities,activity_descriptions,activity_address_streets,activity_localities,activity_emails,activity_phone_numbers,activity_links,activity_images
0,Auckland,Auckland Scenic Tour 3 Hour,Auckland Scenic Tour travelling over the Auckl...,6 Customs Street East,Auckland Central,waihekewinetours@gmail.com,+64 21 438 222,https://www.newzealand.com/int/plan/business/a...,https://www.newzealand.com/assets/externally-m...
1,Auckland,Odysseum Auckland,Odysseum Auckland has two amazing entertainmen...,291-297 Queen Street,Auckland Central,auckland@odysseum.co.nz,+64 9 365 1145,https://www.newzealand.com/int/plan/business/o...,https://www.newzealand.com/assets/externally-m...
2,Auckland,Auckland Museum,Auckland Museum tells the story of New Zealand...,Auckland Domain,Auckland Central,info@aucklandmuseum.com,+64 9 309 0443,https://www.newzealand.com/int/plan/business/a...,https://www.newzealand.com/assets/externally-m...
3,Auckland,Auckland Tours,Enjoy a range of small group tours with Bush a...,3A Enterprise Drive,Auckland Central,info@bushandbeach.co.nz,+64 9 837 4130,https://www.newzealand.com/int/plan/business/a...,https://www.newzealand.com/assets/externally-m...
4,Auckland,Skydive Auckland,Experience the highest skydive in New Zealand ...,73 Green Road,Helensville,info@skydiveauckland.com,+64 21 921 659,https://www.newzealand.com/int/plan/business/s...,https://www.newzealand.com/assets/externally-m...
...,...,...,...,...,...,...,...,...,...
195,Wanaka,Southern Lakes Helibike,The ultimate day out in Wanaka: Scenic helicop...,10 Lloyd Dunn Ave,Wānaka Town,info@southernlakeshelibike.com,+64 3 443 4000,https://www.newzealand.com/int/plan/business/s...,https://www.newzealand.com/assets/externally-m...
196,Wanaka,Boat & Bike Combo,The only guided Boat/Bike Combo on Lake Wanaka...,103 Ardmore Street,Wānaka Town,info@discoverwanaka.com,+64 21 919 468,https://www.newzealand.com/int/plan/business/w...,https://www.newzealand.com/assets/externally-m...
197,Wanaka,Wanaka Water Taxi Mou Waho Island Tour,Come and join us on a trip to our favourite pl...,"Wanaka Marina, Lakeside Road",Wānaka Town,info@wanakawatertaxi.co.nz,+64 21 1520 689,https://www.newzealand.com/int/plan/business/w...,https://www.newzealand.com/assets/externally-m...
198,Wanaka,Private 1 Day Wanaka Photography Tour,Wanaka is one of the most photographed destina...,Wanaka,Wānaka Town,info@photographyworkshops.co.nz,+64 27 261 4417,https://www.newzealand.com/int/plan/business/-...,https://www.newzealand.com/assets/externally-m...


## Export to Pickle

In [70]:
# Persist the combined table and each per-city table as pickle files under
# pickle_dump/ (relative to the notebook's working directory).
#
# DataFrame.to_pickle does not create missing parent directories, so the
# folder is created up front; without this a fresh checkout that lacks
# pickle_dump/ fails on the first write with an OSError.
from pathlib import Path  # stdlib; only this export cell needs it

pickle_dir = Path("pickle_dump")
pickle_dir.mkdir(parents=True, exist_ok=True)  # no-op when it already exists

# File names are unchanged, so existing readers of these pickles keep working.
final_data.to_pickle(pickle_dir / "all_city_activities.pkl")
data_auckland.to_pickle(pickle_dir / "auckland_city_activities.pkl")
data_queenstown.to_pickle(pickle_dir / "queenstown_city_activities.pkl")
data_tekapo.to_pickle(pickle_dir / "tekapo_city_activities.pkl")
data_wanaka.to_pickle(pickle_dir / "wanaka_city_activities.pkl")

In [71]:
# Shut down the Firefox WebDriver session opened at the start of the notebook
# now that all data has been collected and exported.
driver_firefox.quit()