In [None]:
!pip install beautifulsoup4




[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\Benjo\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [6]:
!pip install selenium webdriver-manager -q


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\Benjo\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [8]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import time
import sys
from urllib.parse import urljoin, urlparse, parse_qs
from bs4 import BeautifulSoup
import json
import os

# --- Scraper configuration -------------------------------------------------

# Site root.
BASE_URL = "https://www.autolist.com"

# Listing search URL; `{page_num}` is filled in per page via str.format().
# The filters live in the URL fragment (after '#'), one per line below.
STARTING_URL_TEMPLATE = (
    "https://www.autolist.com/listings#"
    "door_count[]=2"
    "&driveline[]=4X4"
    "&exclude_no_price=true"
    "&exclude_regional=true"
    "&limit=100"
    "&page={page_num}"
    "&radius=Any"
)

# File that accumulates the collected listing links (a JSON list).
JSON_OUTPUT_PATH = 'autolist_car_links.json'

# Pages 1..MAX_PAGES_TO_SCRAPE (inclusive) are visited.
MAX_PAGES_TO_SCRAPE = 705
# A fresh browser is launched whenever the page number is a multiple of this.
RESTART_DRIVER_EVERY_N_PAGES = 5

# Seconds slept between consecutive page requests.
REQUEST_DELAY = 1.5
# Seconds slept after the first result becomes visible, before parsing the page.
ACTION_DELAY = 1

# In-memory set of every listing link known so far; used for de-duplication.
all_ad_links = set()

# Seed the set from a previous run's output file, when one is present.
# An undecodable file only produces a warning and leaves the set empty.
if os.path.exists(JSON_OUTPUT_PATH):
    try:
        with open(JSON_OUTPUT_PATH, 'r', encoding='utf-8') as f:
            saved_links = json.load(f)
    except json.JSONDecodeError:
        print("Warning: JSON file exists but could not be decoded.")
    else:
        all_ad_links.update(saved_links)
        print(f"Loaded {len(saved_links)} links from existing file.")

def web_driver():
    """Launch and return a new Chrome WebDriver instance.

    The chromedriver binary is obtained through
    ``ChromeDriverManager().install()`` on every call, and the browser is
    started with a fixed window size, a fixed user-agent string and a few
    stability-related switches.

    Returns:
        The ``webdriver.Chrome`` instance that was started.
    """
    chrome_flags = (
        '--no-sandbox',
        '--disable-gpu',
        "--window-size=1280,1024",
        '--disable-dev-shm-usage',
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
        # NOTE(review): "accept-language" does not look like a recognised
        # Chrome switch, so this flag may have no effect - confirm, and if a
        # specific Accept-Language is required set it via browser prefs.
        "accept-language=en-US,en;q=0.9",
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    chrome_service = ChromeService(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

def append_link_to_json(link, filepath):
    """Append a single link to the JSON list stored at ``filepath``.

    The file is created when it does not exist. A link that is already in
    the stored list is not added again and the file is left untouched.

    A file whose content cannot be decoded as JSON, or whose top-level JSON
    value is not a list, is treated as an empty list and is overwritten.

    The updated list is first written to ``<filepath>.tmp`` and then moved
    over the target with ``os.replace``, so an interruption in the middle of
    a write cannot leave a half-written JSON file behind.

    Args:
        link: The link (a JSON-serialisable value, normally a string) to add.
        filepath: Path of the JSON file holding the list of links.
    """
    data = []
    if os.path.exists(filepath):
        with open(filepath, 'r', encoding='utf-8') as f:
            try:
                loaded = json.load(f)
            except json.JSONDecodeError:
                loaded = []
        # Only a JSON list can be appended to; anything else is started afresh.
        if isinstance(loaded, list):
            data = loaded
        if link in data:
            return

    data.append(link)

    # Write to a sibling temp file, then swap it in with a single rename.
    tmp_path = f"{filepath}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    os.replace(tmp_path, filepath)

def _append_links_to_json(new_links, filepath):
    """Add ``new_links`` to the JSON list at ``filepath`` with one read and one write.

    Links already present in the stored list are skipped. A missing file, an
    undecodable file, or a file whose top-level JSON value is not a list is
    treated as an empty list. The result is written to ``<filepath>.tmp`` and
    then moved over the target with ``os.replace`` so an interrupted write
    cannot leave a truncated JSON file.
    """
    if not new_links:
        return

    stored = []
    if os.path.exists(filepath):
        with open(filepath, 'r', encoding='utf-8') as f:
            try:
                loaded = json.load(f)
            except json.JSONDecodeError:
                loaded = []
        if isinstance(loaded, list):
            stored = loaded

    already_stored = set(stored)
    fresh = [link for link in new_links if link not in already_stored]
    if not fresh:
        return

    stored.extend(fresh)
    tmp_path = f"{filepath}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(stored, f, ensure_ascii=False, indent=4)
    os.replace(tmp_path, filepath)


# Main collection loop: visit each listing page, pull the detail-page anchors
# out of the rendered HTML and persist every link not seen before.
driver = None
try:
    print("Starting link collection...")

    driver = web_driver()
    # Maximum seconds to wait for the first search result to become visible.
    wait = WebDriverWait(driver, 1.5)

    for page_num in range(1, MAX_PAGES_TO_SCRAPE + 1):
        # Restart driver periodically
        if page_num % RESTART_DRIVER_EVERY_N_PAGES == 0:
            print(f"Restarting WebDriver at page {page_num} to avoid timeouts...")
            driver.quit()
            # Drop the reference first so that `finally` does not quit the
            # same instance a second time if the relaunch below fails.
            driver = None
            driver = web_driver()
            wait = WebDriverWait(driver, 1.5)

        current_url = STARTING_URL_TEMPLATE.format(page_num=page_num)
        print(f"\n--- Processing Page {page_num}/{MAX_PAGES_TO_SCRAPE} ---")
        print(f"URL: {current_url}")
        sys.stdout.flush()

        try:
            driver.get(current_url)
            wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[data-testid="search-result"] a[data-testid="details-anchor"]')))
            # Pause after the first result is visible before snapshotting the page.
            time.sleep(ACTION_DELAY)

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            link_tags_on_page = soup.select('a[data-testid="details-anchor"]')

            if not link_tags_on_page:
                print(f"No ad links found on page {page_num}.")
                time.sleep(REQUEST_DELAY)
                continue

            new_links_this_page = []
            for link_tag in link_tags_on_page:
                href = link_tag.get('href')
                if href:
                    # NOTE(review): the href is stored exactly as found. If the
                    # site emits relative hrefs, urljoin(BASE_URL, href) would be
                    # needed - confirm against the links already saved before
                    # changing this, because de-duplication compares raw strings.
                    full_url = href
                    if full_url not in all_ad_links:
                        all_ad_links.add(full_url)
                        new_links_this_page.append(full_url)

            new_links_found_this_page = len(new_links_this_page)
            # Persist once per page: rewriting the whole JSON file for every
            # single link gets slower and slower as the file grows.
            _append_links_to_json(new_links_this_page, JSON_OUTPUT_PATH)

            print(f"Found {len(link_tags_on_page)} link tags. Added {new_links_found_this_page} new unique links.")
            print(f"Total unique links collected so far: {len(all_ad_links)}")

        except TimeoutException:
            print(f"Timeout on page {page_num}. Skipping.")
        except Exception as e:
            # Best effort: report the failure and move on to the next page.
            print(f"Error on page {page_num}: {e}")

        time.sleep(REQUEST_DELAY)

except KeyboardInterrupt:
    print("\nScraping interrupted by user.")
except Exception as e_outer:
    print(f"A critical error occurred: {e_outer}")
finally:
    # Always release the browser, however the loop ended.
    if driver:
        driver.quit()
        print("WebDriver closed.")
    print(f"\nFinal total unique links collected: {len(all_ad_links)}")


Loaded 94379 links from existing file.
Starting link collection...

--- Processing Page 1/705 ---
URL: https://www.autolist.com/listings#door_count[]=2&driveline[]=4X4&exclude_no_price=true&exclude_regional=true&limit=100&page=1&radius=Any
Found 20 link tags. Added 20 new unique links.
Total unique links collected so far: 94399

--- Processing Page 2/705 ---
URL: https://www.autolist.com/listings#door_count[]=2&driveline[]=4X4&exclude_no_price=true&exclude_regional=true&limit=100&page=2&radius=Any
Found 100 link tags. Added 100 new unique links.
Total unique links collected so far: 94499

--- Processing Page 3/705 ---
URL: https://www.autolist.com/listings#door_count[]=2&driveline[]=4X4&exclude_no_price=true&exclude_regional=true&limit=100&page=3&radius=Any
Found 100 link tags. Added 100 new unique links.
Total unique links collected so far: 94599

--- Processing Page 4/705 ---
URL: https://www.autolist.com/listings#door_count[]=2&driveline[]=4X4&exclude_no_price=true&exclude_regional=