In [1]:
# Task: Collect car listing URLs from the website.
# Steps:
# 1. Identify and gather all buttons that contain car listing URLs.
# 2. Extract and construct the full URLs from the relevant attributes.
# 3. Store each URL in a list for further processing.
# Next task: Use the collected URLs to visit each listing and extract detailed information (e.g., car specs, prices, contact details).


In [67]:
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
import time

# Open the first results page for fuel type 1 in a fresh Chrome session.
service = Service()
main_driver = webdriver.Chrome(service=service)
main_driver.get("https://avtoelon.uz/uz/avto/?auto-fuel=1")

# Fixed pause before querying the DOM (presumably to let the listing
# buttons render — TODO confirm an explicit wait is not needed instead).
time.sleep(3)

# Every listing card exposes its relative link in the button's `data-url` attribute.
buttons = main_driver.find_elements(By.XPATH, '//button[contains(@class, "js__advert-button")]')
urls = []

# Collect URLs
for button in buttons:
    partial_url = button.get_attribute('data-url')
    # get_attribute returns None when the attribute is absent; skip those
    # buttons instead of storing a bogus "https://avtoelon.uzNone" entry.
    if partial_url:
        url = f"https://avtoelon.uz{partial_url}"
        urls.append(url)  # Append the URL to the list


In [None]:
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
import time
import pandas as pd

# ---------- STEP 1: Collect all listing URLs ----------
def get_listing_urls(driver, fuel_type, max_pages=None, max_consecutive_errors=3, page_delay=2):
    """Walk the paginated search results and collect absolute listing URLs.

    Pages are requested one after another, starting at page 1, until a page
    contains no listing buttons, ``max_pages`` is reached, or too many pages
    in a row fail.

    Args:
        driver: Object exposing ``get(url)`` and ``find_elements(by, value)``
            (a Selenium WebDriver in this notebook).
        fuel_type: Value interpolated into the ``auto-fuel`` query parameter.
        max_pages: Optional upper bound on the number of pages to visit;
            ``None`` (default) means no bound.
        max_consecutive_errors: Stop after this many pages in a row raise.
            Without this cap a dead browser session would make the loop
            run forever, because every subsequent page fails the same way.
        page_delay: Seconds to sleep after each page load.

    Returns:
        list[str]: Full listing URLs in the order they were encountered.
    """
    page = 1
    all_urls = []
    consecutive_errors = 0

    while max_pages is None or page <= max_pages:
        try:
            url = f"https://avtoelon.uz/uz/avto/?auto-fuel={fuel_type}&page={page}"
            driver.get(url)
            time.sleep(page_delay)

            buttons = driver.find_elements(By.XPATH, '//button[contains(@class, "js__advert-button")]')
            if not buttons:
                print(f"  No more listings on page {page}.")
                break

            for button in buttons:
                partial_url = button.get_attribute('data-url')
                # Buttons without a data-url attribute yield None; skip them.
                if partial_url:
                    all_urls.append(f"https://avtoelon.uz{partial_url}")

            print(f"  Page {page} scraped with {len(buttons)} listings.")
            consecutive_errors = 0  # a good page resets the failure streak

        except Exception as e:
            consecutive_errors += 1
            if consecutive_errors >= max_consecutive_errors:
                print(
                    f"  Error on page {page}: {e}. "
                    f"Stopping after {consecutive_errors} consecutive failures."
                )
                break
            print(f"  Error on page {page}: {e}. Continuing...")

        page += 1

    return all_urls

# ---------- STEP 2: Scrape individual car data ----------
# Output column name -> key under which a scraped listing stores that value.
# The columns use plural names while the per-listing record uses singular
# ones, so the mapping has to be explicit.
_COLUMN_TO_FIELD = {
    "brands": "brand",
    "names": "name",
    "prices": "price",
    "cities": "city",
    "engines": "engine",
    "kuzovs": "kuzov",
    "milages": "milage",
    "driving_types": "driving_type",
    "colors": "color",
    "paint_info": "paint_info",
    "uzatmalar": "uzatma",
    "negotiations": "negotiation",
    "exteriors": "exterior",
    "optika": "optika",
    "audioes": "audio",
    "optsiyalar": "optsiyalar",
    "additionals": "additional",
    "phone_numbers": "phone",
    "photos": "photos",  # Add new column for photos
}

_PHONE_XPATH = '//div[contains(@class, "phone")]//ul'
_PHOTO_XPATH = '/html/body/div[1]/div[1]/main/div/div/section/div/div[1]/div[2]/ul/li/img'

# Fields read before the parameter block is clicked. The XPaths are purely
# positional (dd[3], dd[4], ...), so they depend on the page's current layout.
_HEADER_FIELDS = (
    ("brand", '//header//h1/span[1]'),
    ("name", '//header//h1/span[2]'),
    ("price", '//header/div[2]'),
    ("city", '//section/div/div[1]/div[1]/div/dl/dd[1]'),
    ("engine", '//section/div/div[2]/div[1]/div/dl/dd[3]'),
    ("kuzov", '//section/div/div[2]/div[1]/div/dl/dd[4]'),
    ("milage", '//section/div/div[2]/div[1]/div/dl/dd[5]'),
    ("driving_type", '//section/div/div[2]/div[1]/div/dl/dd[6]'),
    ("color", '//section/div/div[2]/div[1]/div/dl/dd[7]'),
    ("paint_info", '//section/div/div[2]/div[1]/div/dl/dd[8]'),
    ("uzatma", '//section/div/div[2]/div[1]/div/dl/dd[9]'),
    ("negotiation", '//section/div/div[2]/div[1]/div/dl/dd[10]'),
)

# Fields read from the "js-params-block" list after it has been clicked.
_PARAM_FIELDS = (
    ("exterior", '//*[@id="js-params-block"]/ul/li[1]'),
    ("optika", '//*[@id="js-params-block"]/ul/li[2]'),
    ("audio", '//*[@id="js-params-block"]/ul/li[3]'),
    ("optsiyalar", '//*[@id="js-params-block"]/ul/li[4]'),
    ("additional", '//*[@id="js-params-block"]/ul/li[5]'),
)


def _safe_text(driver, xpath):
    """Return the text of the first element matching ``xpath``, or "" if the lookup fails."""
    try:
        return driver.find_element(By.XPATH, xpath).text
    except Exception:
        return ""


def _scrape_listing(driver, url):
    """Scrape one listing page into a dict, or return None if the phone step fails.

    The phone element is clicked first; if loading the page or that click
    raises, the listing is reported as failed so the caller can retry it.
    """
    try:
        driver.get(url)
        time.sleep(2)
        phone_button = driver.find_element(By.XPATH, _PHONE_XPATH)
        phone_button.click()
        time.sleep(3)  # pause after the click before reading the element's text
        result = {"phone": phone_button.text}
    except Exception:
        return None

    for field, xpath in _HEADER_FIELDS:
        result[field] = _safe_text(driver, xpath)

    # Best effort: click the parameter block before reading its list items.
    try:
        driver.find_element(By.ID, "js-params-block").click()
        time.sleep(1)
    except Exception:
        pass  # the list items below simply resolve to "" when absent

    for field, xpath in _PARAM_FIELDS:
        result[field] = _safe_text(driver, xpath)

    try:
        sources = [img.get_attribute('src') for img in driver.find_elements(By.XPATH, _PHOTO_XPATH)]
        # Drop images without a src so the join never sees None.
        result["photos"] = "; ".join(src for src in sources if src)  # Join all photo URLs with semicolon
    except Exception:
        result["photos"] = ""

    return result


def scrape_listing_data(driver, urls, retries=5, retry_wait=40):
    """Visit every listing URL and collect its details column by column.

    Listings whose phone number cannot be revealed are retried up to
    ``retries`` times. A listing that succeeds on a retry is scraped in full,
    so every column always has one entry per stored listing and the result
    can be passed straight to ``pd.DataFrame``.

    Args:
        driver: Object exposing ``get``, ``find_element`` and ``find_elements``
            (a Selenium WebDriver in this notebook).
        urls: Iterable of listing URLs to visit.
        retries: Maximum number of retry rounds for failed listings.
        retry_wait: Seconds to sleep between retry rounds; skipped when
            nothing is left to retry or no round remains.

    Returns:
        dict[str, list[str]]: One equal-length list per output column.
        Listings that still fail after all retries are omitted.
    """
    data = {column: [] for column in _COLUMN_TO_FIELD}

    def store(row):
        # Translate the singular record keys into the plural column names.
        for column, field in _COLUMN_TO_FIELD.items():
            data[column].append(row.get(field, ""))

    # ---------- First pass over all listings ----------
    unread_urls = []
    for full_url in urls:
        print(f"Scraping: {full_url}")
        row = _scrape_listing(driver, full_url)
        if row is None:
            unread_urls.append(full_url)
        else:
            store(row)

    # ---------- Retry listings whose phone number was not available ----------
    for attempt in range(1, retries + 1):
        if not unread_urls:
            break
        print(f"Retrying phone numbers... Attempt {attempt}/{retries}")
        still_unread = []

        for url in unread_urls:
            print(f"Retrying: {url}")
            row = _scrape_listing(driver, url)
            if row is None:
                still_unread.append(url)
            else:
                store(row)

        unread_urls = still_unread
        if unread_urls and attempt < retries:
            time.sleep(retry_wait)

    print(f"Finished retrying. Ignored {len(unread_urls)} listings with missing phone numbers.")

    return data

if __name__ == "__main__":
    # Start a dedicated Chrome session for the whole run.
    service = Service()
    driver = webdriver.Chrome(service=service)

    try:
        fuel_type = 1  # Change this as needed
        urls = get_listing_urls(driver, fuel_type)
        print(f"Collected {len(urls)} listing URLs.")

        scraped_data = scrape_listing_data(driver, urls)

        # Persist the scraped columns as a CSV file in the working directory.
        df = pd.DataFrame(scraped_data)
        df.to_csv("scraped_car_data.csv", index=False)
    finally:
        # Always release the browser, even when scraping or saving fails,
        # so no orphaned Chrome process is left behind.
        driver.quit()
    
# Quick note: I mistakenly quit the driver, but since I already
# have the URLs saved, I can definitely reuse them.


  Page 1 scraped with 20 listings.
  Page 2 scraped with 20 listings.
  Page 3 scraped with 20 listings.
  Page 4 scraped with 20 listings.
  Page 5 scraped with 20 listings.
  Page 6 scraped with 20 listings.
  Page 7 scraped with 20 listings.
  Page 8 scraped with 20 listings.
  Page 9 scraped with 20 listings.
  Page 10 scraped with 20 listings.
  Page 11 scraped with 20 listings.
  Page 12 scraped with 20 listings.
  Page 13 scraped with 20 listings.
  Page 14 scraped with 20 listings.
  Page 15 scraped with 20 listings.
  Page 16 scraped with 20 listings.
  Page 17 scraped with 20 listings.
  Page 18 scraped with 20 listings.
  Page 19 scraped with 20 listings.
  Page 20 scraped with 20 listings.
  Page 21 scraped with 20 listings.
  Page 22 scraped with 20 listings.
  Page 23 scraped with 20 listings.
  Page 24 scraped with 20 listings.
  Page 25 scraped with 20 listings.
  Page 26 scraped with 20 listings.
  Page 27 scraped with 20 listings.
  Page 28 scraped with 20 listings.
 

  Page 202 scraped with 20 listings.
  Page 203 scraped with 20 listings.
  Page 204 scraped with 20 listings.
  Page 205 scraped with 20 listings.
  Page 206 scraped with 20 listings.
  Page 207 scraped with 20 listings.
  Page 208 scraped with 20 listings.
  Page 209 scraped with 20 listings.
  Page 210 scraped with 20 listings.
  Page 211 scraped with 20 listings.
  Page 212 scraped with 20 listings.
  Page 213 scraped with 20 listings.
  Page 214 scraped with 20 listings.
  Page 215 scraped with 20 listings.
  Page 216 scraped with 20 listings.
  Page 217 scraped with 20 listings.
  Page 218 scraped with 20 listings.
  Page 219 scraped with 20 listings.
  Page 220 scraped with 20 listings.
  Page 221 scraped with 20 listings.
  Page 222 scraped with 20 listings.
  Page 223 scraped with 20 listings.
  Page 224 scraped with 20 listings.
  Page 225 scraped with 20 listings.
  Page 226 scraped with 20 listings.
  Page 227 scraped with 20 listings.
  Page 228 scraped with 20 listings.
 

  Page 421 scraped with 20 listings.
  Page 422 scraped with 20 listings.
  Page 423 scraped with 20 listings.
  Page 424 scraped with 20 listings.
  Page 425 scraped with 20 listings.
  Page 426 scraped with 20 listings.
  Page 427 scraped with 20 listings.
  Page 428 scraped with 20 listings.
  Page 429 scraped with 20 listings.
  Page 430 scraped with 20 listings.
  Page 431 scraped with 20 listings.
  Page 432 scraped with 20 listings.
  Page 433 scraped with 20 listings.
  Page 434 scraped with 20 listings.
  Page 435 scraped with 20 listings.
  Page 436 scraped with 20 listings.
  Page 437 scraped with 20 listings.
  Page 438 scraped with 20 listings.
  Page 439 scraped with 20 listings.
  Page 440 scraped with 20 listings.
  Page 441 scraped with 20 listings.
  Page 442 scraped with 20 listings.
  Page 443 scraped with 20 listings.
  Page 444 scraped with 20 listings.
  Page 445 scraped with 20 listings.
  Page 446 scraped with 20 listings.
  Page 447 scraped with 20 listings.
 

  Page 643 scraped with 20 listings.
  Page 644 scraped with 20 listings.
  Page 645 scraped with 20 listings.
  Page 646 scraped with 20 listings.
  Page 647 scraped with 20 listings.
  Page 648 scraped with 20 listings.
  Page 649 scraped with 20 listings.
  Page 650 scraped with 20 listings.
  Page 651 scraped with 20 listings.
  Page 652 scraped with 20 listings.
  Page 653 scraped with 20 listings.
  Page 654 scraped with 20 listings.
  Page 655 scraped with 20 listings.
  Page 656 scraped with 20 listings.
  Page 657 scraped with 20 listings.
  Page 658 scraped with 20 listings.
  Page 659 scraped with 20 listings.
  Page 660 scraped with 20 listings.
  Page 661 scraped with 20 listings.
  Page 662 scraped with 20 listings.
  Page 663 scraped with 20 listings.
  Page 664 scraped with 20 listings.
  Page 665 scraped with 20 listings.
  Page 666 scraped with 20 listings.
  Page 667 scraped with 20 listings.
  Page 668 scraped with 20 listings.
  Page 669 scraped with 20 listings.
 