In [1]:
import json
import time
import threading
import concurrent.futures
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os


class CarScraper:
    """
    Scraper for avtoelon.uz car listings for specified fuel types.
    Scrapes listing URLs page by page for each fuel type, then scrapes each listing concurrently.
    Retries fetching listing URLs for a page up to 5 times if no listings are found.
    Saves output as JSON Lines (one JSON object per line) with a "fuel_type" field, appending to the existing file.
    """

    BASE_URL = "https://avtoelon.uz"
    PAGE_FETCH_RETRIES = 5
    PAGE_FETCH_RETRY_DELAY = 5  # seconds

    def __init__(self, fuel_types=(1,), json_filename="avto_elon_data.jsonl", url_log="scraped_urls.json"):
        if not isinstance(fuel_types, (list, tuple)):
            raise ValueError("fuel_types must be a list or tuple of integers.")
        self.fuel_types = fuel_types
        self.json_filename = json_filename
        self.url_log = url_log

        self.lock = threading.Lock()
        self.all_scraped_urls = self._load_existing_urls("scraped")
        self.all_failed_urls = self._load_existing_urls("failed")

        # Create log file if it doesn't exist
        if not os.path.exists(self.url_log):
            with open(self.url_log, "w", encoding="utf-8") as f:
                json.dump({"scraped": {}, "failed": {}}, f)

    def _load_existing_urls(self, key):
        if os.path.exists(self.url_log):
            try:
                with open(self.url_log, "r", encoding="utf-8") as f:
                    data = json.load(f)
                    return data.get(key, {})
            except json.JSONDecodeError:
                print(f"[WARNING] Could not decode existing {self.url_log}. Starting with empty logs.")
                return {}
        return {}

    def _create_driver(self):
        options = Options()
        options.add_argument("--headless=new")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--window-size=1920,1080")
        options.add_argument("--disable-dev-shm-usage")
        return webdriver.Chrome(service=Service(), options=options)

    def _safe_get_text(self, driver, by, selector, timeout=5):
        try:
            element = WebDriverWait(driver, timeout).until(EC.presence_of_element_located((by, selector)))
            return element.text.strip()
        except TimeoutException:
            return ""

    def _get_listing_urls(self, driver, page_num, fuel_type):
        url = f"{self.BASE_URL}/uz/avto/?auto-fuel={fuel_type}&page={page_num}"
        driver.get(url)
        time.sleep(1)

        urls = []
        try:
            buttons = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "js__advert-button"))
            )
            for btn in buttons:
                partial_url = btn.get_attribute("data-url")
                if partial_url:
                    full_url = self.BASE_URL + partial_url
                    if fuel_type not in self.all_scraped_urls or full_url not in self.all_scraped_urls[fuel_type]:
                        urls.append(full_url)
        except TimeoutException:
            print(f"[INFO - Fuel {fuel_type}] No listing buttons found on page {page_num}.")
        except Exception as e:
            print(f"[ERROR - Fuel {fuel_type}] Failed to extract listing URLs on page {page_num}: {e}")

        return urls

    def _click_phone_button(self, driver, timeout=10):
        try:
            phone_button = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "phone"))
            )
            ul_element = WebDriverWait(phone_button, timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "ul"))
            )
            ul_element.click()
            WebDriverWait(phone_button, timeout).until(
                lambda d: any(char.isdigit() for char in phone_button.text) or "..." not in phone_button.text
            )
            return phone_button.text.strip()
        except (NoSuchElementException, ElementClickInterceptedException, TimeoutException):
            return ""

    def _extract_description_params(self, driver, timeout=5):
        params = {}
        try:
            description_block = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "description-params"))
            )
            dts = description_block.find_elements(By.TAG_NAME, "dt")
            dds = description_block.find_elements(By.TAG_NAME, "dd")

            for dt, dd in zip(dts, dds):
                key = dt.text.strip()
                val = dd.text.strip()
                params[key] = val
        except TimeoutException:
            pass
        except NoSuchElementException:
            pass
        return params

    def _extract_photos(self, driver, timeout=5):
        photos = []
        try:
            photo_elements = WebDriverWait(driver, timeout).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.main-photo img"))
            )
            for img in photo_elements:
                src = img.get_attribute("src")
                if src:
                    photos.append(src)
        except TimeoutException:
            pass
        except NoSuchElementException:
            pass
        return photos

    def _scrape_listing(self, url, fuel_type, retry=0, max_retries=3):
        driver = self._create_driver()
        data = {"source_url": url, "fuel_type": fuel_type}
        try:
            driver.get(url)
            time.sleep(1)

            data["phone_number"] = self._click_phone_button(driver)

            full_title = self._safe_get_text(driver, By.CSS_SELECTOR, "header h1")
            if full_title:
                parts = full_title.split(" ", 1)
                data["brand"] = parts[0]
                data["name"] = parts[1] if len(parts) > 1 else ""
            else:
                data["brand"] = ""
                data["name"] = ""

            data["price"] = self._safe_get_text(driver, By.CSS_SELECTOR, "header div.price")
            data["description_params"] = self._extract_description_params(driver)
            data["photos"] = self._extract_photos(driver)

            with self.lock:
                with open(self.json_filename, "a", encoding="utf-8") as f:
                    f.write(json.dumps(data, ensure_ascii=False) + "\n")
                if fuel_type not in self.all_scraped_urls:
                    self.all_scraped_urls[fuel_type] = []
                self.all_scraped_urls[fuel_type].append(url)

        except Exception as e:
            print(f"[ERROR - Fuel {fuel_type}] Failed scraping {url}: {e}")
            if retry < max_retries:
                print(f"[RETRY - Fuel {fuel_type}] Retrying {url} ({retry + 1}/{max_retries})")
                driver.quit()
                time.sleep(2)
                self._scrape_listing(url, fuel_type, retry + 1)
                return
            else:
                with self.lock:
                    if fuel_type not in self.all_failed_urls:
                        self.all_failed_urls[fuel_type] = []
                    self.all_failed_urls[fuel_type].append(url)
        finally:
            driver.quit()

    def scrape_fuel_type(self, fuel_type, start_page=1, max_threads=5):
        page_num = start_page
        scraped_urls_fuel = self.all_scraped_urls.get(fuel_type, [])
        failed_urls_fuel = self.all_failed_urls.get(fuel_type, [])

        while True:
            found_listings = False
            for attempt in range(self.PAGE_FETCH_RETRIES):
                print(f"[INFO - Fuel {fuel_type}] Scraping page {page_num} (Attempt {attempt + 1})...")
                driver = self._create_driver()
                urls = self._get_listing_urls(driver, page_num, fuel_type)
                driver.quit()

                if urls:
                    found_listings = True
                    break
                else:
                    print(f"[WARNING - Fuel {fuel_type}] No new listings found on page {page_num} (Attempt {attempt + 1}). Retrying in {self.PAGE_FETCH_RETRY_DELAY} seconds...")
                    time.sleep(self.PAGE_FETCH_RETRY_DELAY)

            if not found_listings:
                print(f"[INFO - Fuel {fuel_type}] No new listings found after {self.PAGE_FETCH_RETRIES} retries on page {page_num}. Moving to next fuel type.")
                break

            urls_to_scrape = [url for url in urls if url not in scraped_urls_fuel and url not in failed_urls_fuel]

            if urls_to_scrape:
                with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
                    futures = [executor.submit(self._scrape_listing, url, fuel_type) for url in urls_to_scrape]
                    for future in concurrent.futures.as_completed(futures):
                        pass

                print(f"[INFO - Fuel {fuel_type}] Finished scraping page {page_num} ({len(urls_to_scrape)} new listings).")
                scraped_urls_fuel.extend(urls_to_scrape)
            else:
                print(f"[INFO - Fuel {fuel_type}] No new listings to scrape on page {page_num}.")

            page_num += 1
            time.sleep(1)
        return list(set(scraped_urls_fuel)), list(set(failed_urls_fuel))

    def scrape_all(self, max_threads=5):
        for i, fuel_type in enumerate(self.fuel_types):
            print(f"\n{'='*30} Scraping Fuel Type: {fuel_type} {'='*30}")
            start_page = 1
            if i == 0:
                start_page = 50
            scraped, failed = self.scrape_fuel_type(fuel_type, start_page=start_page, max_threads=max_threads)
            self.all_scraped_urls[fuel_type] = scraped
            if failed:
                self.all_failed_urls[fuel_type] = failed

        with open(self.url_log, "w", encoding="utf-8") as f:
            json.dump({"scraped": self.all_scraped_urls, "failed": self.all_failed_urls}, f, ensure_ascii=False, indent=2)

        total_scraped = sum(len(urls) for urls in self.all_scraped_urls.values())
        total_failed = sum(len(urls) for urls in self.all_failed_urls.values())

        print(f"\n[DONE] Total scraped: {total_scraped} | Total failed: {total_failed}")
        print(f"[INFO] Logs saved to {self.url_log}")


if __name__ == "__main__":
    fuel_types_to_scrape = [1, 2, 3, 4]
    scraper = CarScraper(fuel_types=fuel_types_to_scrape)
    scraper.scrape_all(max_threads=5)


[INFO - Fuel 1] Scraping page 50 (Attempt 1)...
[INFO - Fuel 1] Finished scraping page 50 (20 new listings).
[INFO - Fuel 1] Scraping page 51 (Attempt 1)...
[INFO - Fuel 1] Finished scraping page 51 (6 new listings).
[INFO - Fuel 1] Scraping page 52 (Attempt 1)...
[INFO - Fuel 1] Finished scraping page 52 (18 new listings).
[INFO - Fuel 1] Scraping page 53 (Attempt 1)...
[INFO - Fuel 1] Finished scraping page 53 (6 new listings).
[INFO - Fuel 1] Scraping page 54 (Attempt 1)...
[INFO - Fuel 1] Finished scraping page 54 (11 new listings).
[INFO - Fuel 1] Scraping page 55 (Attempt 1)...
[INFO - Fuel 1] Finished scraping page 55 (9 new listings).
[INFO - Fuel 1] Scraping page 56 (Attempt 1)...
[INFO - Fuel 1] Finished scraping page 56 (14 new listings).
[INFO - Fuel 1] Scraping page 57 (Attempt 1)...
[INFO - Fuel 1] Scraping page 57 (Attempt 2)...
[INFO - Fuel 1] Scraping page 57 (Attempt 3)...
[INFO - Fuel 1] Scraping page 57 (Attempt 4)...
[INFO - Fuel 1] Scraping page 57 (Attempt 5)...

[INFO - Fuel 2] Scraping page 12 (Attempt 3)...
[INFO - Fuel 2] Scraping page 12 (Attempt 4)...
[INFO - Fuel 2] Scraping page 12 (Attempt 5)...
[INFO - Fuel 2] No new listings found after 5 retries on page 12. Moving to next fuel type.

[INFO - Fuel 3] Scraping page 1 (Attempt 1)...
[INFO - Fuel 3] Finished scraping page 1 (20 new listings).
[INFO - Fuel 3] Scraping page 2 (Attempt 1)...
[INFO - Fuel 3] Finished scraping page 2 (14 new listings).
[INFO - Fuel 3] Scraping page 3 (Attempt 1)...
[INFO - Fuel 3] Finished scraping page 3 (16 new listings).
[INFO - Fuel 3] Scraping page 4 (Attempt 1)...
[INFO - Fuel 3] Finished scraping page 4 (14 new listings).
[INFO - Fuel 3] Scraping page 5 (Attempt 1)...
[INFO - Fuel 3] Finished scraping page 5 (17 new listings).
[INFO - Fuel 3] Scraping page 6 (Attempt 1)...
[INFO - Fuel 3] Finished scraping page 6 (18 new listings).
[INFO - Fuel 3] Scraping page 7 (Attempt 1)...
[INFO - Fuel 3] Finished scraping page 7 (18 new listings).
[INFO - Fuel 3

[ERROR - Fuel 3] Failed scraping https://avtoelon.uz/uz/a/show/6233616: Message: timeout: Timed out receiving message from renderer: -0.010
  (Session info: chrome=136.0.7103.113)
Stacktrace:
#0 0x5603af40ebbe <unknown>
#1 0x5603aeecbbcb <unknown>
#2 0x5603aeeb73c6 <unknown>
#3 0x5603aeeb717c <unknown>
#4 0x5603aeeb56c8 <unknown>
#5 0x5603aeeb5b79 <unknown>
#6 0x5603aeec3220 <unknown>
#7 0x5603aeed95bf <unknown>
#8 0x5603aeedeb0b <unknown>
#9 0x5603aeeb60ed <unknown>
#10 0x5603aeed93b7 <unknown>
#11 0x5603aef60db7 <unknown>
#12 0x5603aef3fa73 <unknown>
#13 0x5603aef09d19 <unknown>
#14 0x5603aef0aac8 <unknown>
#15 0x5603af3db53a <unknown>
#16 0x5603af3de98e <unknown>
#17 0x5603af3de438 <unknown>
#18 0x5603af3dee15 <unknown>
#19 0x5603af3c4cab <unknown>
#20 0x5603af3df180 <unknown>
#21 0x5603af3adc19 <unknown>
#22 0x5603af3fdf25 <unknown>
#23 0x5603af3fe10b <unknown>
#24 0x5603af40dd35 <unknown>
#25 0x7f0c9d6a81f5 <unknown>

[RETRY - Fuel 3] Retrying https://avtoelon.uz/uz/a/show/6233616

[ERROR - Fuel 3] Failed scraping https://avtoelon.uz/uz/a/show/6316293: Message: timeout: Timed out receiving message from renderer: -0.014
  (Session info: chrome=136.0.7103.113)
Stacktrace:
#0 0x56181c68ebbe <unknown>
#1 0x56181c14bbcb <unknown>
#2 0x56181c1373c6 <unknown>
#3 0x56181c13717c <unknown>
#4 0x56181c1356c8 <unknown>
#5 0x56181c135b79 <unknown>
#6 0x56181c143220 <unknown>
#7 0x56181c1595bf <unknown>
#8 0x56181c15eb0b <unknown>
#9 0x56181c1360ed <unknown>
#10 0x56181c1593b7 <unknown>
#11 0x56181c1e0db7 <unknown>
#12 0x56181c1bfa73 <unknown>
#13 0x56181c189d19 <unknown>
#14 0x56181c18aac8 <unknown>
#15 0x56181c65b53a <unknown>
#16 0x56181c65e98e <unknown>
#17 0x56181c65e438 <unknown>
#18 0x56181c65ee15 <unknown>
#19 0x56181c644cab <unknown>
#20 0x56181c65f180 <unknown>
#21 0x56181c62dc19 <unknown>
#22 0x56181c67df25 <unknown>
#23 0x56181c67e10b <unknown>
#24 0x56181c68dd35 <unknown>
#25 0x7f64f90a81f5 <unknown>

[INFO - Fuel 3] Finished scraping page 14 (18 new listings).
[I

[ERROR - Fuel 3] Failed scraping https://avtoelon.uz/uz/a/show/6326042: Message: timeout: Timed out receiving message from renderer: -1147.095
  (Session info: chrome=136.0.7103.113)
Stacktrace:
#0 0x5642fd78fbbe <unknown>
#1 0x5642fd24cbcb <unknown>
#2 0x5642fd2383c6 <unknown>
#3 0x5642fd23817c <unknown>
#4 0x5642fd2366c8 <unknown>
#5 0x5642fd236b79 <unknown>
#6 0x5642fd244220 <unknown>
#7 0x5642fd25a5bf <unknown>
#8 0x5642fd25fb0b <unknown>
#9 0x5642fd2370ed <unknown>
#10 0x5642fd25a3b7 <unknown>
#11 0x5642fd2e1db7 <unknown>
#12 0x5642fd2c0a73 <unknown>
#13 0x5642fd28ad19 <unknown>
#14 0x5642fd28bac8 <unknown>
#15 0x5642fd75c53a <unknown>
#16 0x5642fd75f98e <unknown>
#17 0x5642fd75f438 <unknown>
#18 0x5642fd75fe15 <unknown>
#19 0x5642fd745cab <unknown>
#20 0x5642fd760180 <unknown>
#21 0x5642fd72ec19 <unknown>
#22 0x5642fd77ef25 <unknown>
#23 0x5642fd77f10b <unknown>
#24 0x5642fd78ed35 <unknown>
#25 0x7f74842a81f5 <unknown>

[RETRY - Fuel 3] Retrying https://avtoelon.uz/uz/a/show/6326

TimeoutException: Message: timeout: Timed out receiving message from renderer: -0.031
  (Session info: chrome=136.0.7103.113)
Stacktrace:
#0 0x55cee6209bbe <unknown>
#1 0x55cee5cc6bcb <unknown>
#2 0x55cee5cb23c6 <unknown>
#3 0x55cee5cb217c <unknown>
#4 0x55cee5cb06c8 <unknown>
#5 0x55cee5cb0b79 <unknown>
#6 0x55cee5cbe220 <unknown>
#7 0x55cee5cd45bf <unknown>
#8 0x55cee5cd9b0b <unknown>
#9 0x55cee5cb10ed <unknown>
#10 0x55cee5cd43b7 <unknown>
#11 0x55cee5d5bdb7 <unknown>
#12 0x55cee5d3aa73 <unknown>
#13 0x55cee5d04d19 <unknown>
#14 0x55cee5d05ac8 <unknown>
#15 0x55cee61d653a <unknown>
#16 0x55cee61d998e <unknown>
#17 0x55cee61d9438 <unknown>
#18 0x55cee61d9e15 <unknown>
#19 0x55cee61bfcab <unknown>
#20 0x55cee61da180 <unknown>
#21 0x55cee61a8c19 <unknown>
#22 0x55cee61f8f25 <unknown>
#23 0x55cee61f910b <unknown>
#24 0x55cee6208d35 <unknown>
#25 0x7f07d32a81f5 <unknown>
