In [1]:
!pip install selenium webdriver-manager pandas -q


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\Benjo\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
import json
import re
import sys
from urllib.parse import urlparse, parse_qs
import math # For math.isnan

# --- Configuration ---
LINKS_JSON_FILE_PATH = 'autolist_car_links.json'
OUTPUT_DATA_JSON_PATH = 'autolist_car_details.json'

REQUEST_DELAY_DETAIL_PAGE = 2.5
MAX_ADS_TO_SCRAPE = 10

# --- Helper Functions ---
def clean_price(price_str):
    if price_str is None: return None
    price_str = price_str.replace('$', '').replace(',', '').strip()
    try:
        return int(float(price_str))
    except ValueError:
        return None

def clean_mileage(mileage_str):
    if mileage_str is None: return None
    mileage_str = mileage_str.lower().replace('miles', '').replace(',', '').strip()
    try:
        return int(float(mileage_str))
    except ValueError:
        return None

def extract_monthly_payment(payment_str):
    if payment_str is None: return None
    match = re.search(r'\$(\d{1,3}(?:,\d{3})*(\.\d+)?)/mo', payment_str)
    if match:
        try:
            return int(float(match.group(1).replace(',', '')))
        except ValueError:
            return None
    return None

def clean_text(text):
    return text.strip() if text else None

# --- WebDriver Setup ---
def web_driver():
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--headless') 
    options.add_argument('--disable-gpu')
    options.add_argument("--window-size=1280,1024")
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36")
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    return driver

# --- Main Scraping Logic ---
all_scraped_car_data = []
processed_links = set() 
driver = None

try:
    # Load all collected links
    print(f"Loading links from {LINKS_JSON_FILE_PATH}...")
    try:
        with open(LINKS_JSON_FILE_PATH, 'r') as f:
            all_links_from_file = json.load(f)
    except FileNotFoundError:
        print(f"ERROR: Links file {LINKS_JSON_FILE_PATH} not found. Please run Phase 1 first.")
        sys.exit()
    except json.JSONDecodeError:
        print(f"ERROR: Could not decode JSON from {LINKS_JSON_FILE_PATH}. File might be corrupted.")
        sys.exit()

    # Deduplicate links (though they should be unique if saved from a set)
    unique_ad_links = sorted(list(set(all_links_from_file)))
    print(f"Loaded {len(all_links_from_file)} links, {len(unique_ad_links)} unique links to process.")

    if not unique_ad_links:
        print("No links to process. Exiting.")
        sys.exit()

    driver = web_driver()
    wait = WebDriverWait(driver, 20) # Wait up to 20 seconds for elements

    # --- Optionally load previously scraped data to resume ---
    try:
        df_existing = pd.read_json(OUTPUT_DATA_JSON_PATH, orient='records', lines=False) # Assuming one JSON array
        all_scraped_car_data = df_existing.to_dict('records')
        for record in all_scraped_car_data:
            if 'Scraped_URL' in record: # Assuming you add the URL as a field
                 processed_links.add(record['Scraped_URL'])
        print(f"Loaded {len(all_scraped_car_data)} previously scraped car details. Already processed {len(processed_links)} links.")
    except (FileNotFoundError, ValueError): # ValueError for empty/invalid JSON
        print("No existing data file found or file is empty/invalid. Starting fresh.")
        all_scraped_car_data = []
        processed_links = set()


    links_to_scrape_count = 0
    for i, ad_url in enumerate(unique_ad_links):
        if MAX_ADS_TO_SCRAPE is not None and links_to_scrape_count >= MAX_ADS_TO_SCRAPE:
            print(f"Reached MAX_ADS_TO_SCRAPE limit of {MAX_ADS_TO_SCRAPE}.")
            break

        if ad_url in processed_links:
            # print(f"Skipping already processed link: {ad_url}")
            continue # Skip if already processed in a previous run

        print(f"\nProcessing ad {i+1}/{len(unique_ad_links)}: {ad_url}")
        sys.stdout.flush()
        car_details = {'Scraped_URL': ad_url} # Add the URL itself for tracking

        try:
            driver.get(ad_url)

            wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="vdp-scroll-container"]/div/div[2]/div[1]/div[1]/div[1]/div[3]/div/div[1]/div[1]'))) # Wait for Name
            time.sleep(ACTION_DELAY) # Give some time for JS to populate everything

            # --- Name ---
            try:
                name_element = driver.find_element(By.XPATH, '//*[@id="vdp-scroll-container"]/div/div[2]/div[1]/div[1]/div[1]/div[3]/div/div[1]/div[1]')
                car_details['Name'] = clean_text(name_element.text)
            except NoSuchElementException:
                car_details['Name'] = None
                print("  - Name not found")

            # --- Price ---
            try:
                price_element = driver.find_element(By.XPATH, '//*[@id="vdp-scroll-container"]/div/div[2]/div[1]/div[1]/div[1]/div[3]/div/div[1]/div[2]/div[1]')
                car_details['Price'] = clean_price(price_element.text)
            except NoSuchElementException:
                car_details['Price'] = None
                print("  - Price not found")

            # --- PricePerMonth ---
            try:
                price_per_month_element = driver.find_element(By.XPATH, '//*[@id="vdp-scroll-container"]/div/div[2]/div[1]/div[1]/div[1]/div[3]/div/div[1]/div[2]/div[2]')
                car_details['PricePerMonth'] = extract_monthly_payment(price_per_month_element.text)
            except NoSuchElementException:
                car_details['PricePerMonth'] = None

            # --- Location (from top summary) ---
            try:
                location_element = driver.find_element(By.XPATH, '//*[@id="vdp-scroll-container"]/div/div[2]/div[1]/div[1]/div[1]/div[3]/div/div[2]/div[3]')
                car_details['Location_Summary'] = clean_text(location_element.text)
            except NoSuchElementException:
                car_details['Location_Summary'] = None

            # --- Days on Market & Price Comparison (from top summary) ---
            try:
                price_comp_element = driver.find_element(By.CSS_SELECTOR, 'div.jsx-b0d76f52871971e9.price-comparison-text')
                car_details['Price_Comparison_Text'] = clean_text(price_comp_element.text)
            except: car_details['Price_Comparison_Text'] = None

            try:
                days_market_element = driver.find_element(By.CSS_SELECTOR, 'div.jsx-b0d76f52871971e9.time-on-market')
                car_details['Days_On_Market'] = clean_text(days_market_element.text) # Will be like "6"
            except: car_details['Days_On_Market'] = None


            page_content_for_bs = driver.page_source
            soup_detail = BeautifulSoup(page_content_for_bs, 'html.parser')

            vehicle_info_dl = soup_detail.find('dl', class_='property-list')
            if vehicle_info_dl:
                dts = vehicle_info_dl.find_all('dt')
                dds = vehicle_info_dl.find_all('dd')

                for dt, dd in zip(dts, dds):
                    spec_name = clean_text(dt.text.replace(':', ''))
                    spec_value = clean_text(dd.text)

                    if spec_name == "Mileage":
                        car_details['Mileage'] = clean_mileage(spec_value)
                    elif spec_name == "Trim":
                        car_details['Trim'] = spec_value
                    elif spec_name == "Engine":
                        car_details['Engine'] = spec_value
                    elif spec_name == "VIN":
                        car_details['VIN'] = spec_value
                    elif spec_name == "Exterior color":
                        car_details['Exterior_Color'] = spec_value
                    elif spec_name == "Condition":
                        car_details['Condition'] = spec_value
                    elif spec_name == "Combined gas mileage": # Renamed to avoid space
                        car_details['Gas_Mileage_Combined'] = spec_value
                    elif spec_name == "Doors":
                        car_details['Doors'] = spec_value
                    elif spec_name == "Transmission":
                        car_details['Transmission'] = spec_value
                    elif spec_name == "Drivetrain":
                        car_details['Drivetrain'] = spec_value
                    elif spec_name == "Fuel type":
                        car_details['Fuel_Type'] = spec_value
                    elif spec_name == "Interior color":
                        car_details['Interior_Color'] = spec_value
                    elif spec_name == "Body type": # Was Body Style
                        car_details['Body_Style'] = spec_value
                    elif spec_name == "Stock #": # Was Stock#
                        car_details['Stock_Number'] = spec_value
                    # Add more specific elif for other known dt texts if needed
            else:
                print("  - Could not find <dl class='property-list'> for detailed specs.")


            # --- Key Features (Interior, Exterior, Safety, Other) ---
            # These are usually in <ul> elements following specific <h2> or <div> titles
            feature_sections_map = {
                "Interior": "Interior_Features",
                "Exterior": "Exterior_Features",
                "Safety": "Safety_Features",
                "Other": "Other_Features"
            }
            
            all_feature_titles = soup_detail.find_all('div', class_=lambda x: x and x.startswith('title') and x.endswith('jsx-'))

            for title_div in all_feature_titles:
                section_title_text = clean_text(title_div.text)
                if section_title_text in feature_sections_map:
                    ul_features = title_div.find_next_sibling('ul', attrs={'data-testid': 'feature-list'})
                    if ul_features:
                        features_list = [clean_text(li.text) for li in ul_features.find_all('li', class_='feature-list-item')]
                        car_details[feature_sections_map[section_title_text]] = features_list
                    else:
                         car_details[feature_sections_map[section_title_text]] = []
                elif section_title_text == "Vehicle Information": # To ensure we got all from dl
                    pass # Already handled by dl.property-list

            all_scraped_car_data.append(car_details)
            processed_links.add(ad_url) # Mark as processed
            links_to_scrape_count += 1

        except TimeoutException:
            print(f"  - Timeout waiting for details on page: {ad_url}")
        except Exception as e_detail:
            print(f"  - Error scraping details for {ad_url}: {e_detail}")

        # Save progress periodically
        if links_to_scrape_count > 0 and links_to_scrape_count % 50 == 0: # Save every 50 cars
            temp_df = pd.DataFrame(all_scraped_car_data)
            save_links_to_json(temp_df.to_dict('records'), OUTPUT_DATA_JSON_PATH) # Save all data processed so far
            print(f"--- Progress saved after {links_to_scrape_count} cars ---")

        time.sleep(REQUEST_DELAY_DETAIL_PAGE)

except KeyboardInterrupt:
    print("\nScraping process interrupted by user.")
except Exception as e_main:
    print(f"A critical error occurred in the main scraping process: {e_main}")
finally:
    if driver:
        driver.quit()
        print("WebDriver closed.")

    print("\n--- Individual Ad Scraping Finished (or Interrupted) ---")
    if all_scraped_car_data:
        final_df = pd.DataFrame(all_scraped_car_data)
        print(f"Total cars scraped: {len(final_df)}")
        print("\nSample of scraped data:")
        print(final_df.head().to_string())

        # Save final data to JSON
        print(f"\nSaving final data to {OUTPUT_DATA_JSON_PATH}...")
        save_links_to_json(final_df.to_dict('records'), OUTPUT_DATA_JSON_PATH) # Overwrites with full data

        print("All data saved.")
    else:
        print("No car data was successfully scraped in this session.")

Loading links from autolist_car_links.json...
Loaded 104399 links, 104399 unique links to process.
No existing data file found or file is empty/invalid. Starting fresh.

Processing ad 1/104399: https://www.autolist.com/listings#door_count[]=2&driveline[]=4X4&exclude_no_price=true&exclude_regional=true&limit=20&page=1&radius=Any&vin=1C4AJWAG5GL335490
  - Error scraping details for https://www.autolist.com/listings#door_count[]=2&driveline[]=4X4&exclude_no_price=true&exclude_regional=true&limit=20&page=1&radius=Any&vin=1C4AJWAG5GL335490: name 'ACTION_DELAY' is not defined

Processing ad 2/104399: https://www.autolist.com/listings#door_count[]=2&driveline[]=4X4&exclude_no_price=true&exclude_regional=true&limit=20&page=1&radius=Any&vin=1C4AJWAGXEL327317
  - Error scraping details for https://www.autolist.com/listings#door_count[]=2&driveline[]=4X4&exclude_no_price=true&exclude_regional=true&limit=20&page=1&radius=Any&vin=1C4AJWAGXEL327317: name 'ACTION_DELAY' is not defined

Scraping proce