In [1]:
!pip install selenium webdriver-manager pandas -q


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\Benjo\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
# --- Imports ---
import pandas as pd
import json
import re
import sys
from urllib.parse import urlparse, parse_qs
import math
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
from webdriver_manager.chrome import ChromeDriverManager

# --- Configuration ---
# Input: JSON file containing the listing URLs; loaded once at the start of the main script.
LINKS_JSON_FILE_PATH = 'autolist_car_links.json'
# Output: JSON file of scraped records. It is also read on start-up so that
# URLs already present in it (key 'Scraped_URL') are skipped.
OUTPUT_DATA_JSON_PATH = 'autolist_car_details.json'

# Seconds passed to time.sleep() at the end of every listing attempt.
REQUEST_DELAY_DETAIL_PAGE = 1
# Maximum number of listings scraped successfully in one session.
# The main loop also accepts None here, which disables the limit.
MAX_ADS_TO_SCRAPE = 104399
# Seconds passed to time.sleep() after the snapshot element becomes visible
# and after the "Show more" button is clicked.
ACTION_DELAY = 1

# --- Helper Functions ---
def clean_price(price_str):
    """Convert a displayed price such as ``"$12,345"`` to an integer.

    Dollar signs, thousands separators and surrounding whitespace are
    removed before parsing; any fractional part is truncated.

    Args:
        price_str: Price text, or None.

    Returns:
        The price as an int, or None when the input is None or does not
        represent a finite number.
    """
    if price_str is None:
        return None
    cleaned = price_str.replace('$', '').replace(',', '').strip()
    try:
        return int(float(cleaned))
    except (ValueError, OverflowError):
        # ValueError: not numeric (or "nan"). OverflowError: text such as
        # "inf" parses as a float but cannot be converted to int.
        return None

def clean_mileage(mileage_str):
    """Convert a displayed mileage such as ``"12,345 Miles"`` to an integer.

    The unit may be written as "miles", "mile" or "mi" in any letter case;
    thousands separators and surrounding whitespace are ignored and any
    fractional part is truncated.

    Args:
        mileage_str: Mileage text, or None.

    Returns:
        The mileage as an int, or None when the input is None or does not
        represent a finite number.
    """
    if mileage_str is None:
        return None
    text = mileage_str.lower().replace(',', '')
    # Strip the unit in all of its spellings, not only the plural "miles".
    text = re.sub(r'mi(?:les?)?', '', text).strip()
    try:
        return int(float(text))
    except (ValueError, OverflowError):
        # OverflowError covers text such as "inf", which float() accepts
        # but int() cannot represent.
        return None

def extract_monthly_payment(payment_str):
    """Extract the whole-dollar monthly payment from text like ``"$350/mo"``.

    Args:
        payment_str: Text containing an amount followed by "/mo", or None.

    Returns:
        The amount as an int (fraction truncated), or None when the input
        is None or contains no "<amount>/mo" pattern.
    """
    if payment_str is None:
        return None
    # \d+ (rather than \d{1,3}) keeps the search from locking onto the tail
    # of an amount written without thousands separators: "$1500/mo" must
    # yield 1500, not 500.
    match = re.search(
        r'\$?(\d+(?:,\d{3})*(?:\.\d+)?)\s*/\s*mo',
        payment_str,
        re.IGNORECASE,
    )
    if match is None:
        return None
    try:
        return int(float(match.group(1).replace(',', '')))
    except (ValueError, OverflowError):
        # An absurdly long digit run overflows float -> int conversion.
        return None

def clean_text(text):
    """Return *text* without leading/trailing whitespace.

    Falsy input (None or an empty string) yields None.
    """
    if not text:
        return None
    return text.strip()

# --- URL Transformation ---
def get_direct_vdp_url(fragment_url):
    """Turn a stored listing link into a direct vehicle-detail-page URL.

    Handles three situations, in this order:

    1. An autolist.com URL whose path is already ``/listings/<id>`` is
       returned as ``https://<host><path>`` (query and fragment dropped).
    2. An autolist.com URL carrying ``vin=<VIN>`` in its fragment is
       rewritten to ``https://<host>/listings/<VIN>`` (an existing path
       under ``/listings`` is kept as the prefix).
    3. A bare ``/listings`` URL with no ``vin=`` in query or fragment is
       reported and rejected.

    Args:
        fragment_url: URL as read from the links file.

    Returns:
        The direct URL, None for case 3, or the input unchanged when no
        rule applies or an error occurs while transforming it.
    """
    try:
        parts = urlparse(fragment_url)
        host = parts.netloc
        path = parts.path
        frag = parts.fragment
        on_autolist = "autolist.com" in host

        # Case 1: already a direct listing URL.
        if on_autolist and re.match(r"/listings/[^/]+/?$", path):
            return f"https://{host}{path}"

        # Case 2: VIN carried in the fragment's query-style parameters.
        if frag:
            vin = parse_qs(frag).get('vin', [None])[0]
            if vin and on_autolist:
                if path in ("", "/"):
                    prefix = "/listings/"
                elif path.endswith("/"):
                    prefix = path
                else:
                    prefix = path + "/"
                if not prefix.startswith("/listings"):
                    prefix = "/listings/"
                return f"https://{host}{prefix}{vin}"

        # Case 3: search-results page without any VIN.
        vin_in_query = any(piece.startswith("vin=") for piece in parts.query.split('&'))
        vin_in_fragment = bool(frag) and "vin=" in frag
        if path == "/listings" and not vin_in_query and not vin_in_fragment:
            print(f"  - URL {fragment_url} does not seem to be a VDP link. Skipping transformation.")
            return None

        # Return the original when the transformation failed or does not apply.
        return fragment_url
    except Exception as e:
        print(f"  - Error transforming URL {fragment_url}: {e}")
        return fragment_url

# --- WebDriver Setup ---
def web_driver():
    """Create and return a Chrome WebDriver configured for this scraper.

    The browser is started with the argument list below (headless, fixed
    window size, custom user agent) and the chromedriver binary is
    obtained through ``ChromeDriverManager().install()``.
    """
    chrome_arguments = (
        '--no-sandbox',
        # Headless is enabled; drop this entry to watch the browser while debugging.
        '--headless',
        '--disable-gpu',
        "--window-size=1920,1080",
        '--disable-dev-shm-usage',
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
        # NOTE(review): unsure this is a switch Chrome recognises - confirm.
        "accept-language=en-US,en;q=0.9",
    )

    chrome_options = webdriver.ChromeOptions()
    for argument in chrome_arguments:
        chrome_options.add_argument(argument)

    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=ChromeService(chromedriver_path), options=chrome_options)

# --- Function to save data to JSON ---
def save_data_to_json(data_list, filepath):
    """Write *data_list* to *filepath* as UTF-8, 4-space-indented JSON.

    The data is first written to ``<filepath>.tmp`` and then moved over the
    target with ``os.replace``. If serialisation fails or the process is
    interrupted mid-write, the previous contents of *filepath* stay intact
    instead of being left truncated.

    Args:
        data_list: JSON-serialisable list of records.
        filepath: Destination path.

    Raises:
        TypeError, ValueError: If *data_list* is not JSON-serialisable.
        OSError: If the file cannot be written or replaced.
    """
    import os  # local import: the module's import block does not include os

    tmp_path = os.fspath(filepath) + ".tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data_list, f, ensure_ascii=False, indent=4)
        os.replace(tmp_path, filepath)
    except BaseException:
        # Includes KeyboardInterrupt: drop the partial temp file, keep the
        # existing target untouched, and let the caller see the error.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
    print(f"Spremljeno {len(data_list)} zapisa u {filepath}")


# --- Main Scraping Logic ---
# Flow of this script:
#   1. Load the listing URLs from LINKS_JSON_FILE_PATH and de-duplicate them.
#   2. Load previously scraped records from OUTPUT_DATA_JSON_PATH (if any) so
#      that their 'Scraped_URL' values are skipped.
#   3. Visit each remaining listing with Selenium and collect snapshot data,
#      the "Vehicle Information" specs and the feature lists.
#   4. Save progress every 5 successful listings and once more in `finally`.
all_scraped_car_data = []
processed_vdp_links = set()
driver = None

# Label shown in the "Vehicle Information" section -> key in the output record.
# "Mileage" is handled separately; labels not listed here are stored under
# "Spec_<label with spaces replaced by underscores>".
spec_label_to_key = {
    "Trim": "Trim",
    "Engine": "Engine",
    "VIN": "VIN",
    "Exterior Color": "Exterior_Color",
    "Condition": "Condition",
    "Combined gas mileage": "Gas_Mileage_Combined",
    "Doors": "Doors",
    "Transmission": "Transmission",
    "Drivetrain": "Drivetrain",
    "Fuel type": "Fuel_Type",
    "Interior Color": "Interior_Color",
    "Body Style": "Body_Style",
    "Stock #": "Stock_Number",
}

# Feature-section title on the page -> key in the output record.
feature_sections_map = {
    "Interior": "Interior_Features",
    "Exterior": "Exterior_Features",
    "Safety": "Safety_Features",
    "Other": "Other_Features",
}

# Root of the summary card; all snapshot lookups are relative to it.
snapshot_base_xpath = "//div[contains(@class, 'vehicle-snapshot')]"

try:
    print(f"Učitavanje linkova iz {LINKS_JSON_FILE_PATH}...")
    try:
        with open(LINKS_JSON_FILE_PATH, 'r', encoding='utf-8') as f:
            all_links_from_file = json.load(f)
    except FileNotFoundError:
        print(f"GREŠKA: Datoteka s linkovima {LINKS_JSON_FILE_PATH} nije pronađena. Molimo prvo pokrenite Fazu 1.")
        sys.exit()
    except json.JSONDecodeError:
        print(f"GREŠKA: Nije moguće dekodirati JSON iz {LINKS_JSON_FILE_PATH}. Datoteka je možda oštećena.")
        sys.exit()

    # Sorting makes the processing order deterministic across runs.
    unique_original_input_links = sorted(set(all_links_from_file))
    print(f"Učitano {len(all_links_from_file)} linkova, {len(unique_original_input_links)} jedinstvenih originalnih linkova za obradu.")

    if not unique_original_input_links:
        print("Nema linkova za obradu. Izlazim.")
        sys.exit()

    driver = web_driver()
    # NOTE(review): a 1-second explicit wait is short for a script-rendered
    # page; the captured run shows repeated timeouts - consider raising it.
    wait = WebDriverWait(driver, 1)

    # Resume support: reload earlier results and remember which URLs are done.
    # NOTE(review): an unreadable output file is treated as "start over" and
    # will be overwritten by the next save - confirm that is acceptable.
    try:
        with open(OUTPUT_DATA_JSON_PATH, 'r', encoding='utf-8') as f:
            all_scraped_car_data = json.load(f)
        for record in all_scraped_car_data:
            if 'Scraped_URL' in record:
                processed_vdp_links.add(record['Scraped_URL'])
        print(f"Učitano {len(all_scraped_car_data)} prethodno scrapanih detalja automobila. Već obrađeno {len(processed_vdp_links)} VDP linkova.")
    except (FileNotFoundError, json.JSONDecodeError):
        print("Postojeća datoteka s podacima nije pronađena ili je prazna/nevažeća. Počinjem iznova.")
        all_scraped_car_data = []
        processed_vdp_links = set()

    links_scraped_this_session = 0
    # Success count at the time of the last periodic save. Without it the
    # save below would be repeated after every failed attempt for as long as
    # the success count stays on a multiple of 5.
    last_saved_count = 0

    for i, original_url_from_file in enumerate(unique_original_input_links):
        if MAX_ADS_TO_SCRAPE is not None and links_scraped_this_session >= MAX_ADS_TO_SCRAPE:
            print(f"Dosegnut MAX_ADS_TO_SCRAPE limit od {MAX_ADS_TO_SCRAPE}.")
            break

        vdp_url = get_direct_vdp_url(original_url_from_file)
        if vdp_url is None:
            print(f"\nPreskačem ne-VDP ili netransformabilan link: {original_url_from_file}")
            continue

        if vdp_url in processed_vdp_links:
            continue

        print(f"\nObrada oglasa {links_scraped_this_session + 1} (Ukupni index {i+1}/{len(unique_original_input_links)}):")
        print(f"  Originalni URL iz datoteke: {original_url_from_file}")
        print(f"  Koristim VDP URL: {vdp_url}")
        sys.stdout.flush()

        car_details = {'Scraped_URL': vdp_url}

        try:
            driver.get(vdp_url)
            # The snapshot card is the signal that the listing page rendered;
            # a timeout here abandons this listing (handled below).
            wait.until(EC.visibility_of_element_located((By.XPATH, snapshot_base_xpath)))
            time.sleep(ACTION_DELAY)

            # --- Snapshot: name ---
            try:
                # Target the 'title' div inside 'title-container' inside the snapshot.
                name_element = driver.find_element(By.XPATH, f"{snapshot_base_xpath}//div[contains(@class, 'title-container')]/div[contains(@class, 'title')]")
                car_details['Name'] = clean_text(name_element.text)
            except NoSuchElementException:
                car_details['Name'] = None
                print("  - Ime (snapshot) nije pronađeno.")

            # --- Snapshot: price ---
            try:
                price_element = driver.find_element(By.XPATH, f"{snapshot_base_xpath}//div[contains(@class, 'price-container')]/div[contains(@class, 'price')]")
                car_details['Price'] = clean_price(price_element.text)
            except NoSuchElementException:
                car_details['Price'] = None
                print("  - Cijena (snapshot) nije pronađena.")

            # --- Snapshot: monthly payment ---
            try:
                price_per_month_element = driver.find_element(By.XPATH, f"{snapshot_base_xpath}//div[contains(@class, 'monthly-payment')]")
                car_details['PricePerMonth'] = extract_monthly_payment(price_per_month_element.text)
            except NoSuchElementException:
                car_details['PricePerMonth'] = None

            # --- Snapshot: mileage (desktop element first, then mobile) ---
            mileage_snapshot_value = None
            try:
                mileage_el_snapshot = driver.find_element(By.XPATH, f"{snapshot_base_xpath}//div[contains(@class, 'mileage-desktop')]")
                mileage_snapshot_value = clean_mileage(mileage_el_snapshot.text)
            except NoSuchElementException:
                try:
                    mileage_el_snapshot = driver.find_element(By.XPATH, f"{snapshot_base_xpath}//div[contains(@class, 'mileage-mobile')]")
                    mileage_snapshot_value = clean_mileage(mileage_el_snapshot.text)
                except NoSuchElementException:
                    pass

            # --- Snapshot: location ---
            try:
                location_el_snapshot = driver.find_element(By.XPATH, f"{snapshot_base_xpath}//div[contains(@class, 'location')]")
                car_details['Location'] = clean_text(location_el_snapshot.text)
            except NoSuchElementException:
                car_details['Location'] = None

            # --- "Vehicle Information" label/value pairs ---
            mileage_detailed_value = None
            try:
                vehicle_info_container = wait.until(EC.presence_of_element_located((By.ID, "vehicle-information")))
                spec_item_divs = vehicle_info_container.find_elements(By.XPATH, ".//div[contains(@class, 'info-container')]/div[contains(@class, 'vehicle-info')]")
                if not spec_item_divs:
                    # Fallback: any 'vehicle-info' div below the container.
                    spec_item_divs = vehicle_info_container.find_elements(By.XPATH, ".//div[contains(@class, 'vehicle-info')]")

                for item_div in spec_item_divs:
                    try:
                        label_el = item_div.find_element(By.XPATH, ".//div[contains(@class, 'info-label')]")
                        value_el = item_div.find_element(By.XPATH, ".//div[contains(@class, 'info-data')]")
                    except NoSuchElementException:
                        continue

                    spec_name_raw = clean_text(label_el.text)
                    spec_value = clean_text(value_el.text)
                    if not spec_name_raw:
                        continue
                    spec_name = spec_name_raw.replace(':', '').strip()

                    if spec_name == "Mileage":
                        mileage_detailed_value = clean_mileage(spec_value)
                    else:
                        record_key = spec_label_to_key.get(spec_name, f"Spec_{spec_name.replace(' ', '_')}")
                        car_details[record_key] = spec_value
            except (NoSuchElementException, TimeoutException):
                print("  - Nije pronađen ili je istekao timeout za 'Vehicle Information' sekciju.")

            # Prefer the detailed-section mileage over the snapshot one.
            if mileage_detailed_value is not None:
                car_details['Mileage'] = mileage_detailed_value
            else:
                car_details['Mileage'] = mileage_snapshot_value

            # --- Expand the feature list (best effort) ---
            try:
                show_more_button_features = driver.find_element(By.XPATH, "//div[contains(@class, 'key-features')]//button[contains(text(), 'Show more')]")
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", show_more_button_features)
                time.sleep(0.5)
                show_more_button_features.click()
                print("  - Kliknuto 'Show more' za features.")
                time.sleep(ACTION_DELAY)
            except (NoSuchElementException, ElementClickInterceptedException):
                # No button, or it is covered by another element: continue
                # with whatever features are already visible.
                pass
            except Exception as e_show_more:
                print(f"  - Greška pri interakciji s 'Show more' za features: {e_show_more}")

            # --- Feature lists per section ---
            feature_containers = driver.find_elements(By.XPATH, "//div[contains(@class, 'key-features')]//div[contains(@class, 'feature-container')]")
            if not feature_containers:
                feature_containers = driver.find_elements(By.XPATH, "//div[contains(@class, 'feature-group')]//div[contains(@class, 'feature-container')]")
            if not feature_containers:
                # find_elements returns an empty list rather than raising, so
                # the "nothing found" case has to be detected explicitly.
                print("  - Nisu pronađeni 'key-features' kontejneri.")

            for container_div in feature_containers:
                # Reset per container so the error message below never shows
                # a stale or undefined title.
                section_title_text = None
                try:
                    section_title_el = container_div.find_element(By.XPATH, ".//div[contains(@class, 'title')]")
                    section_title_text = clean_text(section_title_el.text)
                    if section_title_text in feature_sections_map:
                        ul_features = container_div.find_element(By.XPATH, ".//ul[@data-testid='feature-list']")
                        features_list = [clean_text(li.text) for li in ul_features.find_elements(By.TAG_NAME, 'li') if clean_text(li.text)]
                        car_details[feature_sections_map[section_title_text]] = features_list
                except NoSuchElementException:
                    continue
                except Exception as e_feature_item:
                    print(f"    - Greška pri parsiranju stavke feature u sekciji '{section_title_text}': {e_feature_item}")

            all_scraped_car_data.append(car_details)
            processed_vdp_links.add(vdp_url)
            links_scraped_this_session += 1
            print(f"  Uspješno scrapano: {car_details.get('Name', 'N/A')} - VIN: {car_details.get('VIN', 'N/A')}")

        except TimeoutException:
            print(f"  - Timeout pri čekanju ključnih elemenata stranice za URL: {vdp_url}")
            # Marked as processed for this session only; no record is stored,
            # so the URL is attempted again on the next run.
            processed_vdp_links.add(vdp_url)
        except Exception as e_detail:
            print(f"  - Greška pri scrapanju detalja za {vdp_url}: {e_detail}")
            processed_vdp_links.add(vdp_url)

        # Periodic checkpoint: every 5 successful listings, or on reaching the limit.
        checkpoint_due = (
            links_scraped_this_session % 5 == 0
            or links_scraped_this_session == MAX_ADS_TO_SCRAPE
        )
        if links_scraped_this_session != last_saved_count and checkpoint_due:
            save_data_to_json(all_scraped_car_data, OUTPUT_DATA_JSON_PATH)
            last_saved_count = links_scraped_this_session
            print(f"--- Napredak spremljen nakon obrade {links_scraped_this_session} novih automobila u ovoj sesiji ---")

        time.sleep(REQUEST_DELAY_DETAIL_PAGE)

except KeyboardInterrupt:
    print("\nProces scrapinga prekinut od strane korisnika.")
except Exception as e_main:
    print(f"Kritična greška u glavnom procesu scrapinga: {e_main}")
finally:
    # Always release the browser and persist whatever was collected.
    if driver:
        driver.quit()
        print("WebDriver zatvoren.")

    print("\n--- Scraping pojedinačnih oglasa završen (ili prekinut) ---")
    if all_scraped_car_data:
        valid_data_for_df = [item for item in all_scraped_car_data if isinstance(item, dict)]
        if valid_data_for_df:
            final_df = pd.DataFrame(valid_data_for_df)
            print(f"Ukupno jedinstvenih automobila u datasetu: {len(final_df)}")
            if not final_df.empty:
                print("\nPrimjer scrapanih podataka (prvih 5 redaka):")
                print(final_df.head().to_string())
            else:
                print("DataFrame je prazan (nema valjanih podataka za prikaz).")
        else:
            print("Nema valjanih dictionary zapisa za kreiranje DataFrame-a.")

        print(f"\nSpremanje konačnih podataka u {OUTPUT_DATA_JSON_PATH}...")
        save_data_to_json(all_scraped_car_data, OUTPUT_DATA_JSON_PATH)
        print("Svi podaci spremljeni.")
    else:
        print("Novi podaci o automobilima nisu uspješno scrapani u ovoj sesiji, ili podaci nisu učitani iz datoteke.")

Učitavanje linkova iz autolist_car_links.json...
Učitano 104399 linkova, 104399 jedinstvenih originalnih linkova za obradu.
Učitano 19168 prethodno scrapanih detalja automobila. Već obrađeno 19168 VDP linkova.

Obrada oglasa 1 (Ukupni index 999/104399):
  Originalni URL iz datoteke: https://www.autolist.com/listings#door_count[]=2&driveline[]=4X4&exclude_no_price=true&exclude_regional=true&limit=20&page=133&radius=Any&vin=1J4FA39SX5P324840
  Koristim VDP URL: https://www.autolist.com/listings/1J4FA39SX5P324840
  - Timeout pri čekanju ključnih elemenata stranice za URL: https://www.autolist.com/listings/1J4FA39SX5P324840

Obrada oglasa 1 (Ukupni index 1034/104399):
  Originalni URL iz datoteke: https://www.autolist.com/listings#door_count[]=2&driveline[]=4X4&exclude_no_price=true&exclude_regional=true&limit=20&page=135&radius=Any&vin=1FMDE5CP8NLB81861
  Koristim VDP URL: https://www.autolist.com/listings/1FMDE5CP8NLB81861
  - Timeout pri čekanju ključnih elemenata stranice za URL: https