In [None]:
import os
import random
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Selenium setup: build the Chrome options and start the browser session.
chrome_options = Options()
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Chrome parses this switch as "width,height"; the previous "1920x1080"
# form is not a valid value for it.
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--log-level=3")
chrome_options.add_argument("--start-maximized")
# Present a regular desktop Chrome user agent to the site.
chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])

driver_path = "C:\\Users\\yujit\\OneDrive\\Desktop\\chromedriver-win64\\chromedriver.exe"  # Change this to your ChromeDriver path
service = Service(driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Folder to store data
SAVE_FOLDER = "Australiadt"
os.makedirs(SAVE_FOLDER, exist_ok=True)


def load_resume_page(path, default=1):
    """Return the page number stored in the resume file at *path*.

    Falls back to *default* when the file does not exist, is empty, does not
    contain an integer, or contains a page number below 1. Other I/O errors
    (e.g. permission problems) are not swallowed.
    """
    try:
        with open(path, "r") as f:
            page = int(f.read().strip())
    except FileNotFoundError:
        return default
    except ValueError:
        print(f"Resume file {path} is empty or corrupt. Starting from page {default}.")
        return default
    if page < 1:
        # Pages are 1-based; a smaller value would also yield a file counter <= 0.
        print(f"Resume file {path} holds invalid page {page}. Starting from page {default}.")
        return default
    return page


# Load last scraped page (if exists)
resume_file = os.path.join(SAVE_FOLDER, "resume_page.txt")
start_page = load_resume_page(resume_file)

data_list = []  # rows collected since the last save
pages_scraped = 0  # pages scraped during this run
file_counter = (start_page - 1) // 10 + 1  # Start file numbering correctly

# Step 1: Start on Page 1 and Click Next Until Last Scraped Page
driver.get("https://data.gov.au/search?page=1")
print(f"Starting from Page 1. Clicking Next until Page {start_page}...")

current_page = 1
while current_page < start_page:
    try:
        next_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "a.btn-next"))
        )
        # Click through JavaScript rather than a native WebElement.click().
        driver.execute_script("arguments[0].click();", next_button)
    except Exception as exc:
        # `Exception` (not a bare except) so that Ctrl-C / kernel interrupt
        # still stops the run instead of being reported as a missing button.
        print("Next button not found. Stopping early.")
        print(f"Reason: {type(exc).__name__}")
        break
    current_page += 1
    time.sleep(random.uniform(3, 5))  # Random delay to mimic human behavior
    print(f"Reached Page {current_page}")

if current_page < start_page:
    # The browser is on `current_page`, but everything below would label the
    # scraped data (and the resume file) as `start_page`. Abort instead of
    # writing mislabeled data and a wrong resume position.
    driver.quit()
    raise RuntimeError(
        f"Could only reach page {current_page}, not resume page {start_page}. "
        "Re-run to retry, or correct the resume file."
    )

# Step 2: Start Scraping from Last Scraped Page
COLUMNS = ["Title", "Publisher", "Updated Date", "Types Available"]


def extract_dataset_row(dataset):
    """Return [title, publisher, updated_date, types] for one search result.

    *dataset* is the WebElement of a single result item. A field whose
    element is not found is reported as "N/A".
    """
    title_element = dataset.find_elements(By.CSS_SELECTOR, ".dataset-summary-title a")
    title = title_element[0].text.strip() if title_element else "N/A"

    publisher_element = dataset.find_elements(By.CSS_SELECTOR, ".dataset-summary-publisher")
    publisher = publisher_element[0].text.strip() if publisher_element else "N/A"

    updated_element = dataset.find_elements(By.CSS_SELECTOR, ".dataset-summary-updated")
    updated_date = updated_element[0].text.replace("Dataset Updated ", "").strip() if updated_element else "N/A"

    # Extract available types (blank labels are dropped).
    types_elements = dataset.find_elements(By.CSS_SELECTOR, ".dataset-summary-downloads span")
    if types_elements:
        labels = [type_el.text.strip() for type_el in types_elements]
        types = ", ".join(label for label in labels if label)
    else:
        types = "N/A"

    return [title, publisher, updated_date, types]


def save_batch(rows, counter):
    """Write *rows* to an Excel file in SAVE_FOLDER and return (path, next_counter).

    The file is named Australian_Data_<counter>.xlsx. If that file already
    exists the counter is advanced to the next unused number, so a run that
    resumes from a page that is not of the form 10k+1 cannot overwrite the
    partial file written at the end of the previous run.
    """
    file_path = os.path.join(SAVE_FOLDER, f"Australian_Data_{counter}.xlsx")
    while os.path.exists(file_path):
        counter += 1
        file_path = os.path.join(SAVE_FOLDER, f"Australian_Data_{counter}.xlsx")
    pd.DataFrame(rows, columns=COLUMNS).to_excel(file_path, index=False)
    return file_path, counter + 1


print(f"Starting scraping from Page {start_page} onwards...")

try:
    while True:
        print(f"Scraping Page {start_page}")

        # Wait for datasets to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "li.search-results__result"))
        )

        datasets = driver.find_elements(By.CSS_SELECTOR, "li.search-results__result")

        if not datasets:
            print("No more data found. Scraping complete!")
            break

        # Buffer the page's rows and commit them only once the whole page has
        # been processed: an interrupt mid-page then leaves neither partial
        # rows in data_list nor an advanced page number, so the page is
        # re-scraped on resume without duplicates.
        page_rows = []
        for dataset in datasets:
            try:
                page_rows.append(extract_dataset_row(dataset))
            except Exception as e:
                print(f"Error extracting dataset: {e}")
        data_list.extend(page_rows)

        pages_scraped += 1
        start_page += 1  # from here on, start_page is the next page to scrape

        # Save data every 10 pages
        if pages_scraped % 10 == 0:
            file_path, file_counter = save_batch(data_list, file_counter)
            print(f"Saved: {file_path}")

            data_list = []

            with open(resume_file, "w") as f:
                f.write(str(start_page))

        # Click next page
        try:
            next_page_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "a.btn-next"))
            )
            driver.execute_script("arguments[0].click();", next_page_button)
            time.sleep(random.uniform(3, 6))  # Random delay
        except Exception:
            # `Exception` rather than a bare except, so a keyboard interrupt
            # is not misreported as a missing button.
            print(f"Next page button not found. Stopping at page {start_page}.")
            break

except Exception as e:
    print(f"Error encountered: {e}")

finally:
    try:
        # Persist unsaved rows first; the resume position is written only
        # after they are safely on disk.
        if data_list:
            file_path, file_counter = save_batch(data_list, file_counter)
            print(f"Saved remaining data: {file_path}")

        with open(resume_file, "w") as f:
            f.write(str(start_page))
    finally:
        # Close the browser even if saving failed.
        driver.quit()

    print("Scraping finished!")


Starting from Page 1. Clicking Next until Page 1200...
Reached Page 2
Reached Page 3
Reached Page 4
Reached Page 5
Reached Page 6
Reached Page 7
Reached Page 8
Reached Page 9
Reached Page 10
Reached Page 11
Reached Page 12
Reached Page 13
Reached Page 14
Reached Page 15
Reached Page 16
Reached Page 17
Reached Page 18
Reached Page 19
Reached Page 20
Reached Page 21
Reached Page 22
Reached Page 23
Reached Page 24
Reached Page 25
Reached Page 26
Reached Page 27
Reached Page 28
Reached Page 29
Reached Page 30
Reached Page 31
Reached Page 32
Reached Page 33
Reached Page 34
