In [9]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# which is not necessarily the interpreter that a shell-level `pip` resolves to.
%pip install beautifulsoup4





[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\Benjo\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [1]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# which is not necessarily the interpreter that a shell-level `pip` resolves to.
%pip install selenium webdriver-manager -q


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\Benjo\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [11]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import time
import sys
from urllib.parse import urljoin, urlparse, parse_qs
from bs4 import BeautifulSoup 
import json 

# Root URL of the site being scraped.
BASE_URL = "https://www.autolist.com"
# Listings URL for a single results page. Every search parameter, including
# `page`, sits after the `#` (the URL fragment) rather than in a `?` query
# string; `{page_num}` is filled in with str.format for each page visited.
STARTING_URL_TEMPLATE = "https://www.autolist.com/listings#exclude_no_price=true&limit=20&page={page_num}&radius=Any"

MAX_PAGES_TO_SCRAPE = 5 # Pages 1..MAX_PAGES_TO_SCRAPE (inclusive) are visited

# Seconds passed to time.sleep() between consecutive page loads.
REQUEST_DELAY = 1.5
# Seconds passed to time.sleep() after the result links become visible,
# before the page source is read.
ACTION_DELAY = 1

# Unique listing hrefs collected so far; a set, so an href seen on more than
# one page is stored only once.
all_ad_links = set()
# Results are written to a JSON file (see save_links_to_json); the earlier
# plain-text output path is kept below for reference only.
# LINKS_FILE_PATH = 'autolist_car_links.txt' # Not saving to .txt anymore

def web_driver(headless=False):
    """Create and return a configured Chrome WebDriver.

    Args:
        headless: When True, start Chrome without a visible window. Defaults
            to False, which keeps the previous behaviour (visible browser,
            handy while debugging selectors).

    Returns:
        A ``selenium.webdriver.Chrome`` instance. The caller owns it and is
        responsible for calling ``quit()`` when finished.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    if headless:
        options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument("--window-size=1280,1024")
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36")
    # "accept-language=..." is not a Chrome command-line switch, so passing it
    # via add_argument has no effect. The Accept-Language header is controlled
    # by the `intl.accept_languages` profile preference instead.
    options.add_experimental_option("prefs", {"intl.accept_languages": "en-US,en"})
    # webdriver-manager downloads (or reuses a cached) chromedriver binary and
    # returns its path, which is handed to the Selenium service.
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    return driver

# Not needed for this specific 5-page print-to-screen task
# def load_scraped_links(filepath):
#     try:
#         with open(filepath, 'r') as f:
#             return set(line.strip() for line in f)
#     except FileNotFoundError:
#         return set()

def save_links_to_json(links_set, filepath):
    """Dump the given links to ``filepath`` as a sorted, indented JSON array.

    Args:
        links_set: Iterable of link strings (a set or a list both work).
        filepath: Destination path; the file is created or overwritten and
            written as UTF-8.

    Prints a one-line confirmation with the number of links saved.
    """
    # JSON cannot serialise a set, and sorting gives a stable file layout.
    ordered_links = sorted(links_set)
    link_count = len(ordered_links)

    with open(filepath, mode='w', encoding='utf-8') as out_file:
        json.dump(ordered_links, out_file, indent=4, ensure_ascii=False)

    print(f"Saved {link_count} links to {filepath}")

# Main link-collection run: open a browser, visit pages 1..MAX_PAGES_TO_SCRAPE
# of the listings search, collect every listing href into `all_ad_links`, then
# (always, even after an error or Ctrl-C) close the browser, print the links
# and write them to a JSON file.
driver = None
try:
    print("Starting link collection...")

    driver = web_driver()
    # Up to 10 seconds for the first result link to become visible on each page.
    wait = WebDriverWait(driver, 10)

    for page_num in range(1, MAX_PAGES_TO_SCRAPE + 1):
        current_url = STARTING_URL_TEMPLATE.format(page_num=page_num)
        print(f"\n--- Processing Page {page_num}/{MAX_PAGES_TO_SCRAPE} ---")
        print(f"URL: {current_url}")
        sys.stdout.flush()

        try:
            driver.get(current_url)
            # Block until at least one listing link inside a search result is
            # visible; raises TimeoutException otherwise (handled below).
            # NOTE(review): consecutive URLs differ only in the fragment, so
            # this condition may already hold for the previous page's results;
            # the ACTION_DELAY sleep is what gives the page time to re-render.
            wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[data-testid="search-result"] a[data-testid="details-anchor"]')))
            time.sleep(ACTION_DELAY)

            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')

            link_tags_on_page = soup.select('a[data-testid="details-anchor"]')

            if not link_tags_on_page:
                print(f"No ad links found on page {page_num} using selector.")
                # `continue` skips the sleep at the bottom of the loop, so
                # pause here to keep the same pacing between requests.
                time.sleep(REQUEST_DELAY)
                continue

            new_links_found_this_page = 0
            for link_tag in link_tags_on_page:
                href = link_tag.get('href')
                if not href:
                    continue
                # urljoin leaves an absolute href untouched (fragment included)
                # and resolves a relative one against the site root, so the
                # stored link is usable either way.
                full_url = urljoin(BASE_URL, href)
                if full_url not in all_ad_links:
                    all_ad_links.add(full_url)
                    new_links_found_this_page += 1

            print(f"Found {len(link_tags_on_page)} link tags. Added {new_links_found_this_page} new unique links.")
            print(f"Total unique links collected: {len(all_ad_links)}")

        except TimeoutException:
            print(f"Timeout waiting for elements on page {page_num}. Skipping.")
        except Exception as e:
            # Best effort: report the failure and move on to the next page.
            print(f"Error processing page {page_num}: {e}")

        time.sleep(REQUEST_DELAY)

except KeyboardInterrupt:
    print("\nLink collection interrupted by user.")
except Exception as e_outer:
    print(f"A critical error occurred in the main loop: {e_outer}")
finally:
    # Always release the browser, then report whatever was collected so a
    # partial run is not lost.
    if driver:
        driver.quit()
        print("WebDriver closed.")

    print("\n--- Link Collection Phase Finished ---")
    print(f"Total unique ad links collected: {len(all_ad_links)}")

    # A set is not JSON-serialisable; sort for a stable, readable listing.
    links_list_for_json = sorted(all_ad_links)

    print("\nCollected Links (JSON format):")
    print(json.dumps(links_list_for_json, indent=4))

    if links_list_for_json:
        # Derive the name from the configured page count so the file name
        # stays truthful if MAX_PAGES_TO_SCRAPE is changed.
        json_output_path = f'autolist_car_links_first_{MAX_PAGES_TO_SCRAPE}_pages.json'
        save_links_to_json(links_list_for_json, json_output_path)
    else:
        print("No links were collected.")

Starting link collection...

--- Processing Page 1/5 ---
URL: https://www.autolist.com/listings#exclude_no_price=true&limit=20&page=1&radius=Any
Found 20 link tags. Added 20 new unique links.
Total unique links collected: 20

--- Processing Page 2/5 ---
URL: https://www.autolist.com/listings#exclude_no_price=true&limit=20&page=2&radius=Any
Found 20 link tags. Added 20 new unique links.
Total unique links collected: 40

--- Processing Page 3/5 ---
URL: https://www.autolist.com/listings#exclude_no_price=true&limit=20&page=3&radius=Any
Found 20 link tags. Added 20 new unique links.
Total unique links collected: 60

--- Processing Page 4/5 ---
URL: https://www.autolist.com/listings#exclude_no_price=true&limit=20&page=4&radius=Any
Found 20 link tags. Added 20 new unique links.
Total unique links collected: 80

--- Processing Page 5/5 ---
URL: https://www.autolist.com/listings#exclude_no_price=true&limit=20&page=5&radius=Any
Found 20 link tags. Added 20 new unique links.
Total unique links c