In [1]:
pip install requests beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install requests beautifulsoup4 Pillow # Pillow is for basic image handling/viewing

Note: you may need to restart the kernel to use updated packages.


ERROR: Invalid requirement: '#': Expected package name at the start of dependency specifier
    #
    ^


In [4]:
import requests
from bs4 import BeautifulSoup
import os
import time
from urllib.parse import urljoin, urlparse
import re
from PIL import Image # For displaying the image file

# --- CONFIGURATION ---
# 1. *** CHANGE THIS TO YOUR TARGET URL ***
TARGET_URL = 'https://bringatrailer.com/listing/2013-bmw-328i-sedan-12/' 

# Base directory for Stock/Modified folders
BASE_DIR = 'Car_Images_Dataset' 

# 3. SET A USER-AGENT (to avoid getting blocked)
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Define the two final classification folders
STOCK_DIR = os.path.join(BASE_DIR, 'Stock')
MODIFIED_DIR = os.path.join(BASE_DIR, 'Modified')
TEMP_FILE_PATH = 'temp_download.jpg' # Standard temporary file name

# --- FUNCTIONS ---

def get_image_urls(url, headers):
    """Fetches the page and attempts to extract high-res image URLs from BaT's gallery structure."""
    print(f"Fetching HTML from: {url}")
    try:
        # Use a slightly longer timeout for potentially slower BaT responses
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status() 
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    image_urls = []

    # Strategy 1: Look for URLs within <noscript> tags or data attributes for the full images.
    # BaT often embeds the high-res link in a different element or attribute.
    
    # Target the high-resolution images that are typically placed within the main content area.
    # We look for large .jpg or .png links in all tags, not just <img>.
    
    # Check all tags for 'src' or 'href' attributes that contain image file extensions,
    # often found within the main content area (e.g., div.post-content)
    
    post_content = soup.find('div', class_='post-content')
    if post_content:
        for tag in post_content.find_all(True):
            src = tag.get('src')
            href = tag.get('href')
            url_to_check = src if src and 'wp-content/uploads' in src else href
            
            if url_to_check:
                full_url = urljoin(url, url_to_check)
                # Filter for URLs that are NOT the small thumbnails
                if re.match(r'.*uploads/.*(\.jpg|\.jpeg|\.png)$', full_url, re.IGNORECASE) and 'resize=' not in full_url:
                    # Remove query parameters from the URL for cleaner downloading if needed
                    full_url = full_url.split('?')[0] 
                    if full_url not in image_urls:
                        image_urls.append(full_url)
    
    # Strategy 2: Look for gallery links or large image URLs specifically embedded in the page
    # This might find links that were missed in Strategy 1.
    for link in soup.find_all('a', href=True):
         if re.match(r'.*uploads/.*(\.jpg|\.jpeg|\.png)$', link['href'], re.IGNORECASE):
            full_url = urljoin(url, link['href'])
            full_url = full_url.split('?')[0] # Clean up URL
            if full_url not in image_urls:
                 image_urls.append(full_url)
                 
    # Remove duplicate URLs and return the list
    return list(set(image_urls))

# --- IMPORTANT ---
# You need to REPLACE the old 'get_image_urls' function in your script with the one above.
# The rest of your script (download_images_batch, save_temp_image, and __main__) remains the same.

def save_temp_image(img_response):
    """Saves the image content to a temporary file and returns its extension."""
    
    # Get the file extension from the Content-Type header
    content_type = img_response.headers.get('Content-Type', '').lower()
    ext_match = re.search(r'image/(\w+)', content_type)
    ext = ext_match.group(1) if ext_match else 'jpg'
    
    # Update temp file path with correct extension
    temp_file = f"temp_download.{ext}"
    
    with open(temp_file, 'wb') as f:
        for chunk in img_response.iter_content(1024):
            f.write(chunk)
    return temp_file, ext

def download_images_batch(image_urls, headers, car_name, global_label, final_dir):
    """Downloads images using the pre-set batch labels."""
    
    print(f"\n--- Batch Settings ---")
    print(f"Car: {car_name} | Label: {global_label} | Destination: {final_dir}")
    print(f"Found {len(image_urls)} potential images. Starting download...")
    
    for i, img_url in enumerate(image_urls):
        try:
            print("-" * 50)
            print(f"Processing Image [{i+1}/{len(image_urls)}]: {img_url}")

            # 1. Download the image content
            img_response = requests.get(img_url, headers=headers, stream=True, timeout=10)
            img_response.raise_for_status()

            # 2. Save the image to the temporary file
            temp_path, ext = save_temp_image(img_response)
            
            # 3. Display the image for quick verification
            try:
                img = Image.open(temp_path)
                img.show() # Opens image in default viewer
                print(f"Image opened. Please confirm it is an EXTERIOR shot of the {car_name}.")
            except Exception as e:
                print(f"Could not open image automatically. Please verify {os.path.abspath(temp_path)} manually.")
            
            # 4. Interactive Confirmation (Added to skip non-car/interior images)
            confirm = input("Keep this image? (Y/N/SKIP): ").upper()
            if confirm in ['N', 'SKIP']:
                print("Skipping and deleting temporary image.")
                os.remove(temp_path)
                time.sleep(1)
                continue

            # 5. Determine final path and rename
            # Create a unique filename: NAME_LABEL_TIMESTAMP.EXT
            timestamp = int(time.time() * 1000) # Use milliseconds for more uniqueness
            new_filename = f"{car_name}_{global_label}_{timestamp}.{ext}"
            final_path = os.path.join(final_dir, new_filename)
            
            # 6. Rename the temporary file and move it to the final directory
            os.rename(temp_path, final_path)

            print(f"SUCCESS: Saved as '{new_filename}' to {final_dir}")
            
            # *** ESSENTIAL: Add a delay to be respectful ***
            time.sleep(1) 

        except requests.exceptions.RequestException as e:
            print(f"Error downloading {img_url}: {e}")
            if os.path.exists(temp_path): os.remove(temp_path)
            time.sleep(1) 
        except Exception as e:
            print(f"An unexpected error occurred during processing: {e}")
            if os.path.exists(temp_path): os.remove(temp_path)
            time.sleep(1) 
        finally:
            # Ensure the temporary file is removed if it still exists
            if os.path.exists(TEMP_FILE_PATH):
                os.remove(TEMP_FILE_PATH)


# --- MAIN EXECUTION ---

if __name__ == "__main__":
    
    # --- PHASE 1: User Input and Setup ---
    print("--- Dataset Setup ---")
    
    # 1. Get Classification Label
    classification_choice = ""
    while classification_choice not in ['S', 'M']:
        classification_choice = input("Batch Classification: Is this batch [S]tock or [M]odified? ").upper()

    global_label = 'Stock' if classification_choice == 'S' else 'Modified'
    final_dir = STOCK_DIR if classification_choice == 'S' else MODIFIED_DIR
    
    # 2. Get Car Details
    car_make = input("Enter Car Make (e.g., BMW): ").strip()
    car_model = input("Enter Car Model (e.g., M3): ").strip()
    car_year = input("Enter Car Year (e.g., 2024): ").strip()
    
    # 3. Create a clean filename base (e.g., BMW_M3_2024)
    car_name = f"{car_make}_{car_model}_{car_year}".replace(' ', '_').replace('-', '_').strip('_')
    if not car_name:
        print("Error: Car details cannot be empty. Exiting.")
        exit()
        
    # 4. Ensure the necessary folders exist
    os.makedirs(final_dir, exist_ok=True)
    
    # --- PHASE 2: Scraping and Downloading ---
    urls_to_scrape = get_image_urls(TARGET_URL, HEADERS)
    
    if urls_to_scrape:
        download_images_batch(urls_to_scrape, HEADERS, car_name, global_label, final_dir)
    else:
        print("No images found or URL failed. Please check TARGET_URL and try again.")
    
    print("\n" * 3)
    print("=" * 50)
    print(f"✅ Batch Labeling Complete for {car_name} - {global_label}!")
    print(f"Total downloaded files need to be manually counted in the '{final_dir}' folder.")
    print("=" * 50)

--- Dataset Setup ---
Fetching HTML from: https://bringatrailer.com/listing/2013-bmw-328i-sedan-12/
No images found or URL failed. Please check TARGET_URL and try again.




✅ Batch Labeling Complete for bmw_328i_2013 - Stock!
Total downloaded files need to be manually counted in the 'Car_Images_Dataset\Stock' folder.


In [5]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [7]:
import requests
from bs4 import BeautifulSoup
import os
import time
from urllib.parse import urljoin, urlparse
import re
from PIL import Image 

# --- SELENIUM IMPORTS ---
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException

# --- CONFIGURATION ---
TARGET_URL = 'https://bringatrailer.com/listing/2013-bmw-328i-sedan-12/' # Example URL
BASE_DIR = 'Car_Images_Dataset' 
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Define the two final classification folders
STOCK_DIR = os.path.join(BASE_DIR, 'Stock')
MODIFIED_DIR = os.path.join(BASE_DIR, 'Modified')
TEMP_FILE_PATH = 'temp_download.jpg' 

# --- FUNCTIONS ---

def get_image_urls(url):
    """Launches Selenium to execute JavaScript and extract high-res image URLs."""
    print(f"Fetching HTML (via Selenium) from: {url}")
    
    # --- SELENIUM SETUP (ADJUST THIS PART FOR YOUR DRIVER) ---
    try:
        # Recommended modern way for Chrome Driver
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")  # Run browser without GUI for speed
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        
        # If chromedriver is NOT in your PATH, you might need to specify the path
        # service = Service(r'/path/to/your/chromedriver')
        # driver = webdriver.Chrome(service=service, options=options)
        
        # If chromedriver IS in your PATH, this works:
        driver = webdriver.Chrome(options=options)
        
    except WebDriverException as e:
        print("\n--- FATAL ERROR ---")
        print("SELENIUM WEB DRIVER ERROR. Please ensure:")
        print("1. You have installed Selenium (`pip install selenium`).")
        print("2. You have downloaded and correctly set up ChromeDriver (or equivalent) for your browser version.")
        print(f"Original Error: {e}")
        return []

    image_urls = []
    
    try:
        driver.get(url)
        
        # Wait until at least one of the gallery images is loaded (use a common BaT selector)
        WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.post-content img'))
        )
        
        # Extract the HTML source after JavaScript has executed
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Now, use BeautifulSoup on the fully rendered page source
        post_content = soup.find('div', class_='post-content')
        if post_content:
            # Look for the actual image URLs in the 'src' or 'data-src' attributes of <img> tags
            for img_tag in post_content.find_all('img'):
                # Prioritize 'data-src' or full-res attributes if available, otherwise use 'src'
                src = img_tag.get('src')
                
                if src:
                    full_url = urljoin(url, src)
                    
                    # Ensure we are getting the large image, not small thumbnails
                    if re.match(r'.*uploads/.*(\.jpg|\.jpeg|\.png)$', full_url, re.IGNORECASE) and 'resize=' not in full_url:
                        # Clean up URL (remove query parameters)
                        full_url = full_url.split('?')[0]
                        image_urls.append(full_url)
                        
    except TimeoutException:
        print("Timed out waiting for page elements to load. Check your URL.")
    except Exception as e:
        print(f"An error occurred during Selenium fetching: {e}")
    finally:
        driver.quit() # Always close the browser instance

    # Remove duplicates and return
    return list(set(image_urls))

# The rest of the functions (save_temp_image, download_images_batch) remain the same.

def save_temp_image(img_response):
    """Saves the image content to a temporary file and returns its extension."""
    
    content_type = img_response.headers.get('Content-Type', '').lower()
    ext_match = re.search(r'image/(\w+)', content_type)
    ext = ext_match.group(1) if ext_match else 'jpg'
    
    temp_file = f"temp_download.{ext}"
    
    with open(temp_file, 'wb') as f:
        for chunk in img_response.iter_content(1024):
            f.write(chunk)
    return temp_file, ext

def download_images_batch(image_urls, headers, car_name, global_label, final_dir):
    """Downloads images using the pre-set batch labels."""
    
    print(f"\n--- Batch Settings ---")
    print(f"Car: {car_name} | Label: {global_label} | Destination: {final_dir}")
    print(f"Found {len(image_urls)} potential images. Starting download...")
    
    for i, img_url in enumerate(image_urls):
        try:
            print("-" * 50)
            print(f"Processing Image [{i+1}/{len(image_urls)}]: {img_url}")

            img_response = requests.get(img_url, headers=headers, stream=True, timeout=10)
            img_response.raise_for_status()

            temp_path, ext = save_temp_image(img_response)
            
            try:
                img = Image.open(temp_path)
                img.show()
                print(f"Image opened. Please confirm it is an EXTERIOR shot of the {car_name}.")
            except Exception as e:
                print(f"Could not open image automatically. Please verify {os.path.abspath(temp_path)} manually.")
            
            confirm = input("Keep this image? (Y/N/SKIP): ").upper()
            if confirm in ['N', 'SKIP']:
                print("Skipping and deleting temporary image.")
                os.remove(temp_path)
                time.sleep(1)
                continue

            timestamp = int(time.time() * 1000)
            new_filename = f"{car_name}_{global_label}_{timestamp}.{ext}"
            final_path = os.path.join(final_dir, new_filename)
            
            os.rename(temp_path, final_path)

            print(f"SUCCESS: Saved as '{new_filename}' to {final_dir}")
            
            time.sleep(1) 

        except requests.exceptions.RequestException as e:
            print(f"Error downloading {img_url}: {e}")
            if os.path.exists(temp_path): os.remove(temp_path)
            time.sleep(1) 
        except Exception as e:
            print(f"An unexpected error occurred during processing: {e}")
            if os.path.exists(temp_path): os.remove(temp_path)
            time.sleep(1) 
        finally:
            if os.path.exists(TEMP_FILE_PATH):
                os.remove(TEMP_FILE_PATH)


# --- MAIN EXECUTION ---

if __name__ == "__main__":
    
    # --- PHASE 1: User Input and Setup ---
    print("--- Dataset Setup ---")
    
    classification_choice = ""
    while classification_choice not in ['S', 'M']:
        classification_choice = input("Batch Classification: Is this batch [S]tock or [M]odified? ").upper()

    global_label = 'Stock' if classification_choice == 'S' else 'Modified'
    final_dir = STOCK_DIR if classification_choice == 'S' else MODIFIED_DIR
    
    car_make = input("Enter Car Make (e.g., BMW): ").strip()
    car_model = input("Enter Car Model (e.g., 328i): ").strip()
    car_year = input("Enter Car Year (e.g., 2013): ").strip()
    
    car_name = f"{car_make}_{car_model}_{car_year}".replace(' ', '_').replace('-', '_').strip('_')
    if not car_name:
        print("Error: Car details cannot be empty. Exiting.")
        exit()
        
    os.makedirs(final_dir, exist_ok=True)
    
    # --- PHASE 2: Scraping and Downloading ---
    # NOTE: The TARGET_URL variable at the top of the script should be set here.
    urls_to_scrape = get_image_urls(TARGET_URL)
    
    if urls_to_scrape:
        download_images_batch(urls_to_scrape, HEADERS, car_name, global_label, final_dir)
    else:
        print("No images found. This is common for dynamic sites. Check your driver setup and URL.")
    
    print("\n" * 3)
    print("=" * 50)
    print(f"✅ Batch Labeling Complete for {car_name} - {global_label}!")
    print(f"Check your labeled data in the '{final_dir}' folder.")
    print("=" * 50)

--- Dataset Setup ---
Fetching HTML (via Selenium) from: https://bringatrailer.com/listing/2013-bmw-328i-sedan-12/
Timed out waiting for page elements to load. Check your URL.
No images found. This is common for dynamic sites. Check your driver setup and URL.




✅ Batch Labeling Complete for BMW_328i_2013 - Stock!
Check your labeled data in the 'Car_Images_Dataset\Stock' folder.


In [8]:
pip install requests beautifulsoup4 Pillow selenium

Note: you may need to restart the kernel to use updated packages.


In [11]:
import requests
from bs4 import BeautifulSoup
import os
import time
from urllib.parse import urljoin, urlparse
import re
from PIL import Image 

# --- SELENIUM IMPORTS ---
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException

# --- CONFIGURATION ---
# 1. *** CHANGE THIS TO YOUR TARGET URL (e.g., a specific BaT listing) ***
TARGET_URL = 'https://bringatrailer.com/listing/2013-bmw-328i-sedan-12/' 

# Base directory for Stock/Modified folders
BASE_DIR = 'Car_Images_Dataset' 

# 3. SET A USER-AGENT for direct image downloads
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Define the two final classification folders
STOCK_DIR = os.path.join(BASE_DIR, 'Stock')
MODIFIED_DIR = os.path.join(BASE_DIR, 'Modified')
TEMP_FILE_PATH = 'temp_download.jpg' 

# --- FUNCTIONS ---

def get_image_urls(url):
    """Launches Selenium, waits for page to render, and extracts image URLs."""
    print(f"Fetching HTML (via Selenium) from: {url}")
    
    # --- SELENIUM SETUP ---
    try:
        options = webdriver.ChromeOptions()
        # You can try commenting out --headless to visually debug the browser window
        options.add_argument("--headless") 
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--window-size=1920,1080")
        
        driver = webdriver.Chrome(options=options)
        
    except WebDriverException as e:
        print("\n--- FATAL ERROR ---")
        print("SELENIUM WEB DRIVER ERROR. Please ensure ChromeDriver is correctly installed and in your PATH.")
        print(f"Original Error: {e}")
        return []

    image_urls = []
    
    try:
        driver.get(url)
        
        # 1. Wait for a reliable element (the main title) to load
        print("Waiting up to 30 seconds for main page elements...")
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'h1.post-title'))
        )
        
        # 2. CRUCIAL: Introduce a fixed pause for JavaScript to finish rendering the gallery
        print("Main elements loaded. Waiting 10 seconds for images to render...")
        time.sleep(10) 
        
        # 3. Extract the HTML source after rendering
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # --- ROBUST EXTRACTION LOGIC ---
        
        # Find ALL <img> tags and filter them down to the large, high-res photos.
        for img_tag in soup.find_all('img'):
            src = img_tag.get('src')
            data_src = img_tag.get('data-src') # Check for lazy-loaded source
            
            # Prioritize data-src if it exists, otherwise use src
            url_to_check = data_src if data_src else src

            if url_to_check:
                full_url = urljoin(url, url_to_check)
                
                # Filter 1: Must contain 'wp-content/uploads' (BaT's file structure)
                # Filter 2: Must be a common image format (.jpg, .jpeg, .png)
                # Filter 3: Must NOT be a thumbnail (check for 'resize=' or very small dimensions)
                
                is_valid_image = (
                    re.search(r'wp-content/uploads', full_url, re.IGNORECASE) and
                    re.match(r'.*\.(jpg|jpeg|png)$', full_url, re.IGNORECASE) and
                    'resize=' not in full_url.lower()
                )

                if is_valid_image:
                    # Clean up URL (remove query parameters if any remain)
                    clean_url = full_url.split('?')[0] 
                    if clean_url not in image_urls:
                         image_urls.append(clean_url)

        # --- END ROBUST EXTRACTION LOGIC ---
        
    except TimeoutException:
        print("Timed out waiting for page elements (30s). The page may be too slow or blocked.")
    except Exception as e:
        print(f"An error occurred during Selenium fetching: {e}")
    finally:
        driver.quit()

    return list(set(image_urls))

def save_temp_image(img_response):
    """Saves the image content to a temporary file and returns its extension."""
    
    content_type = img_response.headers.get('Content-Type', '').lower()
    ext_match = re.search(r'image/(\w+)', content_type)
    ext = ext_match.group(1) if ext_match else 'jpg'
    
    temp_file = f"temp_download.{ext}"
    
    with open(temp_file, 'wb') as f:
        for chunk in img_response.iter_content(1024):
            f.write(chunk)
    return temp_file, ext

def download_images_batch(image_urls, headers, car_name, global_label, final_dir):
    """Downloads images using the pre-set batch labels and prompts for confirmation."""
    
    print(f"\n--- Batch Settings ---")
    print(f"Car: {car_name} | Label: {global_label} | Destination: {final_dir}")
    print(f"Found {len(image_urls)} potential images. Starting download...")
    
    for i, img_url in enumerate(image_urls):
        try:
            print("-" * 50)
            print(f"Processing Image [{i+1}/{len(image_urls)}]: {img_url}")

            # 1. Download the image content using requests (faster than selenium)
            img_response = requests.get(img_url, headers=headers, stream=True, timeout=10)
            img_response.raise_for_status()

            # 2. Save and display the image
            temp_path, ext = save_temp_image(img_response)
            
            try:
                img = Image.open(temp_path)
                img.show()
                print(f"Image opened. Please confirm it is an EXTERIOR shot of the {car_name}.")
            except Exception as e:
                print(f"Could not open image automatically. Please verify {os.path.abspath(temp_path)} manually.")
            
            # 3. Interactive Confirmation (Crucial for filtering interior/engine shots)
            confirm = input("Keep this image? (Y/N/SKIP): ").upper()
            if confirm in ['N', 'SKIP']:
                print("Skipping and deleting temporary image.")
                os.remove(temp_path)
                time.sleep(1)
                continue

            # 4. Rename and Move
            timestamp = int(time.time() * 1000)
            new_filename = f"{car_name}_{global_label}_{timestamp}.{ext}"
            final_path = os.path.join(final_dir, new_filename)
            
            os.rename(temp_path, final_path)

            print(f"SUCCESS: Saved as '{new_filename}' to {final_dir}")
            
            # 5. Be respectful
            time.sleep(1) 

        except requests.exceptions.RequestException as e:
            print(f"Error downloading {img_url}: {e}")
            if os.path.exists(temp_path): os.remove(temp_path)
            time.sleep(1) 
        except Exception as e:
            print(f"An unexpected error occurred during processing: {e}")
            if os.path.exists(temp_path): os.remove(temp_path)
            time.sleep(1) 
        finally:
            if os.path.exists(TEMP_FILE_PATH):
                os.remove(TEMP_FILE_PATH)


# --- MAIN EXECUTION ---

if __name__ == "__main__":
    
    # --- PHASE 1: User Input and Setup ---
    print("--- Dataset Setup ---")
    
    classification_choice = ""
    while classification_choice not in ['S', 'M']:
        classification_choice = input("Batch Classification: Is this batch [S]tock or [M]odified? ").upper()

    global_label = 'Stock' if classification_choice == 'S' else 'Modified'
    final_dir = STOCK_DIR if classification_choice == 'S' else MODIFIED_DIR
    
    car_make = input("Enter Car Make (e.g., BMW): ").strip()
    car_model = input("Enter Car Model (e.g., 328i): ").strip()
    car_year = input("Enter Car Year (e.g., 2013): ").strip()
    
    car_name = f"{car_make}_{car_model}_{car_year}".replace(' ', '_').replace('-', '_').strip('_')
    if not car_name:
        print("Error: Car details cannot be empty. Exiting.")
        exit()
        
    os.makedirs(final_dir, exist_ok=True)
    
    # --- PHASE 2: Scraping and Downloading ---
    urls_to_scrape = get_image_urls(TARGET_URL)
    
    if urls_to_scrape:
        download_images_batch(urls_to_scrape, HEADERS, car_name, global_label, final_dir)
    else:
        print("No images found. Check your URL, network connection, and Selenium driver setup.")
    
    print("\n" * 3)
    print("=" * 50)
    print(f"✅ Batch Labeling Complete for {car_name} - {global_label}!")
    print(f"Check your labeled data in the '{final_dir}' folder.")
    print("=" * 50)

--- Dataset Setup ---
Fetching HTML (via Selenium) from: https://bringatrailer.com/listing/2013-bmw-328i-sedan-12/
Waiting up to 30 seconds for main page elements...
Main elements loaded. Waiting 10 seconds for images to render...
No images found. Check your URL, network connection, and Selenium driver setup.




✅ Batch Labeling Complete for BMW_328i_2013 - Stock!
Check your labeled data in the 'Car_Images_Dataset\Stock' folder.


In [None]:
import requests
from bs4 import BeautifulSoup
import os
import time
import re
from PIL import Image 
import json
from urllib.parse import urlparse, parse_qs

# --- CONFIGURATION ---
# 1. *** CHANGE THIS TO YOUR TARGET URL (e.g., a specific BaT listing) ***
# Example: 'https://bringatrailer.com/listing/2013-bmw-328i-sedan-12/' 
TARGET_URL = 'https://bringatrailer.com/listing/2013-bmw-328i-sedan-12/' 

# Base directory for Stock/Modified folders
BASE_DIR = 'Car_Images_Dataset' 

# 3. SET A USER-AGENT for direct image downloads
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Define the two final classification folders
STOCK_DIR = os.path.join(BASE_DIR, 'Stock')
MODIFIED_DIR = os.path.join(BASE_DIR, 'Modified')
TEMP_FILE_PATH = 'temp_download.jpg' 

# --- FUNCTIONS ---

def get_image_urls(url, headers):
    """
    Fetches the HTML using requests and extracts image URLs directly by 
    parsing the 'data-gallery-items' JSON attribute.
    """
    print(f"Fetching HTML (via requests) from: {url}")
    
    try:
        # 1. Fetch the page source
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status() # Raise exception for bad status codes
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # 2. Locate the gallery data DIV
        gallery_div = soup.find('div', {'data-gallery-items': True})
        
        if not gallery_div:
            print("ERROR: Could not find the 'data-gallery-items' div.")
            return []

        # 3. Extract the JSON string
        json_string = gallery_div['data-gallery-items']
        
        # 4. Parse the JSON string into a Python list of dictionaries
        image_data = json.loads(json_string)
        
        image_urls = []
        for item in image_data:
            # We want the 'large' resolution URL
            if 'large' in item and 'url' in item['large']:
                full_url = item['large']['url']
                
                # Clean up the URL: remove 'fit' query parameters that resize the image
                parsed_url = urlparse(full_url)
                # Keep only the path part (the base image file)
                clean_url = parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path
                
                if clean_url not in image_urls:
                    image_urls.append(clean_url)

        return list(set(image_urls))

    except requests.exceptions.RequestException as e:
        print(f"An error occurred during network request: {e}")
        return []
    except json.JSONDecodeError as e:
        print(f"An error occurred while parsing JSON from 'data-gallery-items': {e}")
        return []
    except Exception as e:
        print(f"An unexpected error occurred during extraction: {e}")
        return []


def save_temp_image(img_response):
    """Saves the image content to a temporary file and returns its extension."""
    
    content_type = img_response.headers.get('Content-Type', '').lower()
    ext_match = re.search(r'image/(\w+)', content_type)
    ext = ext_match.group(1) if ext_match else 'jpg'
    
    temp_file = f"temp_download.{ext}"
    
    with open(temp_file, 'wb') as f:
        # Write chunks to file
        for chunk in img_response.iter_content(1024):
            f.write(chunk)
    return temp_file, ext

def download_images_batch(image_urls, headers, car_name, global_label, final_dir):
    """Downloads images using the pre-set batch labels and prompts for confirmation."""
    
    print(f"\n--- Batch Settings ---")
    print(f"Car: {car_name} | Label: {global_label} | Destination: {final_dir}")
    print(f"Found {len(image_urls)} potential images. Starting download...")
    
    temp_path = TEMP_FILE_PATH # Initialize the temp path variable
    
    for i, img_url in enumerate(image_urls):
        try:
            print("-" * 50)
            print(f"Processing Image [{i+1}/{len(image_urls)}]: {img_url}")

            # 1. Download the image content using requests
            img_response = requests.get(img_url, headers=headers, stream=True, timeout=10)
            img_response.raise_for_status()

            # 2. Save and display the image
            temp_path, ext = save_temp_image(img_response)
            
            try:
                img = Image.open(temp_path)
                img.show()
                print(f"Image opened. Please classify this image as Stock or Modified.")
            except Exception as e:
                print(f"Could not open image automatically. Please verify {os.path.abspath(temp_path)} manually.")
                
            
            # 3. Interactive Confirmation
            while True:
                confirm = input("Keep this image? (Y/N/SKIP): ").upper()
                if confirm in ['Y', 'N', 'SKIP']:
                    break
                print("Invalid input. Please enter Y, N, or SKIP.")

            if confirm in ['N', 'SKIP']:
                print("Skipping and deleting temporary image.")
                os.remove(temp_path)
                time.sleep(0.5)
                continue

            # 4. Rename and Move (using the pre-set batch label for the file name)
            timestamp = int(time.time() * 1000)
            new_filename = f"{car_name}_{global_label}_{timestamp}.{ext}"
            final_path = os.path.join(final_dir, new_filename)
            
            os.rename(temp_path, final_path)

            print(f"SUCCESS: Saved as '{new_filename}' to {final_dir}")
            
            # 5. Be respectful
            time.sleep(0.5) 

        except requests.exceptions.RequestException as e:
            print(f"Error downloading {img_url}: {e}")
            if os.path.exists(temp_path): os.remove(temp_path)
            time.sleep(0.5) 
        except Exception as e:
            print(f"An unexpected error occurred during processing: {e}")
            if os.path.exists(temp_path): os.remove(temp_path)
            time.sleep(0.5) 
        finally:
            # Ensure any temporary files are cleaned up
            if os.path.exists(temp_path) and temp_path.startswith('temp_download.'):
                 os.remove(temp_path)


# --- MAIN EXECUTION ---

if __name__ == "__main__":
    
    # --- PHASE 1: User Input and Setup ---
    print("--- Dataset Setup ---")
    
    classification_choice = ""
    while classification_choice not in ['S', 'M']:
        classification_choice = input("Batch Classification: Is this batch [S]tock or [M]odified? ").upper()

    global_label = 'Stock' if classification_choice == 'S' else 'Modified'
    final_dir = STOCK_DIR if classification_choice == 'S' else MODIFIED_DIR
    
    car_make = input("Enter Car Make (e.g., BMW): ").strip()
    car_model = input("Enter Car Model (e.g., 328i): ").strip()
    car_year = input("Enter Car Year (e.g., 2013): ").strip()
    
    car_name = f"{car_make}_{car_model}_{car_year}".replace(' ', '_').replace('-', '_').strip('_')
    if not car_name:
        print("Error: Car details cannot be empty. Exiting.")
        exit()
        
    os.makedirs(final_dir, exist_ok=True)
    
    # --- PHASE 2: Scraping and Downloading ---
    urls_to_scrape = get_image_urls(TARGET_URL, HEADERS)
    
    if urls_to_scrape:
        download_images_batch(urls_to_scrape, HEADERS, car_name, global_label, final_dir)
    else:
        print("No images found. Check your URL or network connection.")
    
    print("\n" * 3)
    print("=" * 50)
    print(f"✅ Batch Labeling Complete for {car_name} - {global_label}!")
    print(f"Check your labeled data in the '{final_dir}' folder.")
    print("=" * 50)

--- Dataset Setup ---
Fetching HTML (via requests) from: https://bringatrailer.com/listing/2013-bmw-328i-sedan-12/

--- Batch Settings ---
Car: BMW_328i_2013 | Label: Stock | Destination: Car_Images_Dataset\Stock
Found 152 potential images. Starting download...
--------------------------------------------------
Processing Image [1/152]: https://bringatrailer.com/wp-content/uploads/2025/11/2013_bmw_328i-sedan_IMG_2701-34841-scaled.jpg
Image opened. Please classify this image as Stock or Modified.
Skipping and deleting temporary image.
--------------------------------------------------
Processing Image [2/152]: https://bringatrailer.com/wp-content/uploads/2025/11/2013_bmw_328i-sedan_IMG_2794-35446-scaled.jpg
Image opened. Please classify this image as Stock or Modified.
Skipping and deleting temporary image.
--------------------------------------------------
Processing Image [3/152]: https://bringatrailer.com/wp-content/uploads/2025/11/2013_bmw_328i-sedan_IMG_2709-34885-scaled.jpg
Image 

In [None]:
import requests
from bs4 import BeautifulSoup
import os
import time
import re
from PIL import Image 
import json
from urllib.parse import urlparse, parse_qs

# --- CONFIGURATION ---

# 1. *** CHANGE THIS TO YOUR TARGET URL (e.g., a specific BaT listing) ***
# This script is configured to fetch data from this URL.
TARGET_URL = 'https://bringatrailer.com/listing/2013-bmw-328i-sedan-12/' 

# Base directory for the entire dataset
BASE_DIR = 'Car_Images_Dataset' 

# Temporary file path used for downloading and previewing images
TEMP_FILE_PATH = 'temp_download.jpg' 

# 3. SET A USER-AGENT for direct image downloads
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# --- HELPER FUNCTIONS ---

def get_image_urls(url, headers):
    """
    Fetches the HTML and extracts image URLs directly by 
    parsing the 'data-gallery-items' JSON attribute, making it very fast.
    """
    print(f"Fetching HTML (via requests) from: {url}")
    
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status() 
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Locate the gallery data DIV
        gallery_div = soup.find('div', {'data-gallery-items': True})
        
        if not gallery_div:
            print("ERROR: Could not find the 'data-gallery-items' div.")
            return []

        # Extract and parse the JSON string
        json_string = gallery_div['data-gallery-items']
        image_data = json.loads(json_string)
        
        image_urls = []
        for item in image_data:
            if 'large' in item and 'url' in item['large']:
                full_url = item['large']['url']
                
                # Clean up the URL: remove query parameters to get the full resolution image
                parsed_url = urlparse(full_url)
                clean_url = parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path
                
                if clean_url not in image_urls:
                    image_urls.append(clean_url)

        return list(set(image_urls))

    except requests.exceptions.RequestException as e:
        print(f"An error occurred during network request: {e}")
        return []
    except json.JSONDecodeError as e:
        print(f"An error occurred while parsing JSON from 'data-gallery-items': {e}")
        return []
    except Exception as e:
        print(f"An unexpected error occurred during extraction: {e}")
        return []

def save_temp_image(img_response):
    """Saves the image content to a temporary file and returns its extension."""
    
    # Determine file extension based on content type
    content_type = img_response.headers.get('Content-Type', '').lower()
    ext_match = re.search(r'image/(\w+)', content_type)
    ext = ext_match.group(1) if ext_match else 'jpg'
    
    temp_file = f"temp_download.{ext}"
    
    with open(temp_file, 'wb') as f:
        # Write chunks to file
        for chunk in img_response.iter_content(1024):
            f.write(chunk)
    return temp_file, ext

def preview_image(img_url, headers):
    """Downloads and displays the first image for preview and asks for confirmation."""
    temp_path = TEMP_FILE_PATH
    try:
        print("\n" + "=" * 50)
        print("--- IMAGE PREVIEW: Verification Step ---")
        print(f"Downloading preview image from: {img_url}")
        
        img_response = requests.get(img_url, headers=headers, stream=True, timeout=10)
        img_response.raise_for_status()

        temp_path, _ = save_temp_image(img_response)
        
        try:
            img = Image.open(temp_path)
            img.show()
            print("The first image is now displayed on your screen.")
        except Exception:
            print(f"Could not open image automatically. Please check {os.path.abspath(temp_path)} manually.")

        while True:
            confirm = input("Does the preview image look correct? Continue with the full batch? (Y/N): ").upper()
            if confirm in ['Y', 'N']:
                break
            print("Invalid input. Please enter Y or N.")

        if os.path.exists(temp_path):
            os.remove(temp_path) # Clean up the preview file

        return confirm == 'Y'

    except requests.exceptions.RequestException as e:
        print(f"Error downloading preview image. Cannot continue: {e}")
        if os.path.exists(temp_path): os.remove(temp_path)
        return False
    except Exception as e:
        print(f"An unexpected error occurred during preview: {e}")
        if os.path.exists(temp_path): os.remove(temp_path)
        return False

def download_images_batch(image_urls, headers, car_name, global_label, final_dir):
    """Downloads images one by one, prompts for confirmation, and saves them to the final directory."""
    
    print(f"\n--- Batch Process Started ---")
    print(f"Car: {car_name} | Batch Label: {global_label} | Destination: {final_dir}")
    print(f"Processing {len(image_urls)} unique images...")
    
    temp_path = TEMP_FILE_PATH 
    
    for i, img_url in enumerate(image_urls):
        try:
            print("-" * 50)
            print(f"Processing Image [{i+1}/{len(image_urls)}]: {os.path.basename(urlparse(img_url).path)}")

            # 1. Download the image content
            img_response = requests.get(img_url, headers=headers, stream=True, timeout=10)
            img_response.raise_for_status()

            # 2. Save and display the image
            temp_path, ext = save_temp_image(img_response)
            
            try:
                img = Image.open(temp_path)
                img.show()
                print(f"Image opened. Please classify this specific image (overrides batch label if necessary).")
            except Exception:
                print(f"Could not open image automatically. Please check {os.path.abspath(temp_path)} manually.")
                
            
            # 3. Interactive Classification and Confirmation
            while True:
                # The user can override the batch classification for this single image
                confirm = input("Keep this image? [Y]es (saves as batch label), [N]o (skip), [M]odified (save as modified), [S]tock (save as stock): ").upper()
                if confirm in ['Y', 'N', 'M', 'S']:
                    break
                print("Invalid input. Please enter Y, N, M, or S.")

            if confirm == 'N':
                print("Skipping and deleting temporary image.")
                os.remove(temp_path)
                time.sleep(0.1)
                continue
            
            # Determine the final label and directory based on user input
            if confirm == 'M':
                current_label = 'Modified'
                current_dir = os.path.join(os.path.dirname(final_dir), current_label)
            elif confirm == 'S':
                current_label = 'Stock'
                current_dir = os.path.join(os.path.dirname(final_dir), current_label)
            else: # 'Y' uses the global batch label
                current_label = global_label
                current_dir = final_dir
                
            # Ensure the specific target directory exists (important if classification is overridden)
            os.makedirs(current_dir, exist_ok=True)


            # 4. Rename and Move
            timestamp = int(time.time() * 1000)
            new_filename = f"{car_name}_{current_label}_{timestamp}.{ext}"
            final_path = os.path.join(current_dir, new_filename)
            
            os.rename(temp_path, final_path)

            print(f"SUCCESS: Saved as '{new_filename}' to {current_dir}")
            
            # 5. Respectful delay
            time.sleep(0.2) 

        except requests.exceptions.RequestException as e:
            print(f"Error downloading {img_url}: {e}")
            if os.path.exists(temp_path): os.remove(temp_path)
            time.sleep(0.2) 
        except Exception as e:
            print(f"An unexpected error occurred during processing: {e}")
            if os.path.exists(temp_path): os.remove(temp_path)
            time.sleep(0.2) 
        finally:
            # Ensure any temporary files are cleaned up
            if os.path.exists(temp_path) and temp_path.startswith('temp_download.'):
                 try:
                     os.remove(temp_path)
                 except OSError:
                     pass # File might have been renamed/moved already


# --- MAIN EXECUTION ---

if __name__ == "__main__":
    
    # --- PHASE 1: User Input and Setup ---
    print("=" * 50)
    print("--- Car Image Scraper & Batch Labeler ---")
    print("=" * 50)
    
    # Get Car Details for Folder Structure
    car_make = input("1. Enter Car Make (e.g., BMW): ").strip()
    car_model = input("2. Enter Car Model (e.g., 328i): ").strip()
    car_year = input("3. Enter Car Year (e.g., 2013): ").strip()
    
    car_make_folder = car_make.replace(' ', '_').replace('-', '_').strip('_').upper()
    car_name = f"{car_make}_{car_model}_{car_year}".replace(' ', '_').replace('-', '_').strip('_')
    
    if not car_name or not car_make_folder:
        print("Error: Car details cannot be empty. Exiting.")
        exit()

    # Define Dynamic Paths
    MAKE_DIR = os.path.join(BASE_DIR, car_make_folder)
    STOCK_DIR_FINAL = os.path.join(MAKE_DIR, 'Stock')
    MODIFIED_DIR_FINAL = os.path.join(MAKE_DIR, 'Modified')

    # Get Batch Classification
    classification_choice = ""
    while classification_choice not in ['S', 'M']:
        classification_choice = input("\n4. Primary Batch Classification: Is this batch primarily [S]tock or [M]odified? ").upper()

    global_label = 'Stock' if classification_choice == 'S' else 'Modified'
    final_dir = STOCK_DIR_FINAL if classification_choice == 'S' else MODIFIED_DIR_FINAL
    
    # Create the target directory structure
    os.makedirs(final_dir, exist_ok=True)
    
    print("-" * 50)
    print(f"Output folder structure prepared: {final_dir}")
    
    # --- PHASE 2: Scraping and Preview ---
    urls_to_scrape = get_image_urls(TARGET_URL, HEADERS)
    
    if urls_to_scrape:
        first_url = urls_to_scrape[0]
        
        # New: Preview the first image and ask for confirmation
        if preview_image(first_url, HEADERS):
            # --- PHASE 3: Download and Labeling ---
            download_images_batch(urls_to_scrape, HEADERS, car_name, global_label, final_dir)
        else:
            print("Operation canceled by user based on the preview. Exiting.")
    else:
        print("No image URLs were successfully extracted. Check the TARGET_URL setting.")
    
    print("\n" * 2)
    print("=" * 50)
    print(f"✅ Interactive Batch Labeling Session Concluded.")
    print(f"Data saved to the '{BASE_DIR}' folder.")
    print("=" * 50)

--- Car Image Scraper & Batch Labeler ---
--------------------------------------------------
Output folder structure prepared: Car_Images_Dataset\BMW\Stock
Fetching HTML (via requests) from: https://bringatrailer.com/listing/2013-bmw-328i-sedan-12/

--- IMAGE PREVIEW: Verification Step ---
Downloading preview image from: https://bringatrailer.com/wp-content/uploads/2025/11/2013_bmw_328i-sedan_IMG_7842-35665-scaled.jpg
The first image is now displayed on your screen.

--- Batch Process Started ---
Car: BMW_328i_2013 | Batch Label: Stock | Destination: Car_Images_Dataset\BMW\Stock
Processing 152 unique images...
--------------------------------------------------
Processing Image [1/152]: 2013_bmw_328i-sedan_IMG_7842-35665-scaled.jpg
Image opened. Please classify this specific image (overrides batch label if necessary).
SUCCESS: Saved as 'BMW_328i_2013_Stock_1763817088598.jpeg' to Car_Images_Dataset\BMW\Stock
--------------------------------------------------
Processing Image [2/152]: 201

In [1]:
import requests
from bs4 import BeautifulSoup
import os
import time
import re
from PIL import Image 
import json
from urllib.parse import urlparse, parse_qs

# --- CONFIGURATION ---

# 1. *** CHANGE THIS TO YOUR TARGET URL (e.g., a specific BaT listing) ***
# This script is configured to fetch data from this URL.
TARGET_URL = 'https://bringatrailer.com/listing/2013-bmw-328i-sedan-12/' 

# Base directory for the entire dataset
BASE_DIR = 'Car_Images_Dataset' 

# 3. SET A USER-AGENT for direct image downloads
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# --- HELPER FUNCTIONS ---

def get_image_urls(url, headers):
    """
    Fetches the HTML and extracts image URLs directly by 
    parsing the 'data-gallery-items' JSON attribute, making it very fast.
    """
    print(f"Fetching HTML (via requests) from: {url}")
    
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status() 
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Locate the gallery data DIV
        gallery_div = soup.find('div', {'data-gallery-items': True})
        
        if not gallery_div:
            print("ERROR: Could not find the 'data-gallery-items' div.")
            return []

        # Extract and parse the JSON string
        json_string = gallery_div['data-gallery-items']
        image_data = json.loads(json_string)
        
        image_urls = []
        for item in image_data:
            if 'large' in item and 'url' in item['large']:
                full_url = item['large']['url']
                
                # Clean up the URL: remove query parameters to get the full resolution image
                parsed_url = urlparse(full_url)
                clean_url = parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path
                
                if clean_url not in image_urls:
                    image_urls.append(clean_url)

        return list(set(image_urls))

    except requests.exceptions.RequestException as e:
        print(f"An error occurred during network request: {e}")
        return []
    except json.JSONDecodeError as e:
        print(f"An error occurred while parsing JSON from 'data-gallery-items': {e}")
        return []
    except Exception as e:
        print(f"An unexpected error occurred during extraction: {e}")
        return []

def save_temp_image(img_response):
    """Saves the image content to a temporary file (temp_download.ext) and returns its path and extension."""
    
    # Determine file extension based on content type
    content_type = img_response.headers.get('Content-Type', '').lower()
    ext_match = re.search(r'image/(\w+)', content_type)
    ext = ext_match.group(1) if ext_match else 'jpg'
    
    temp_file = f"temp_download.{ext}"
    
    with open(temp_file, 'wb') as f:
        # Write chunks to file
        for chunk in img_response.iter_content(1024):
            f.write(chunk)
    return temp_file, ext

def preview_images(image_urls, headers, count=3):
    """Downloads and displays the first 'count' images for preview and asks for confirmation."""
    
    urls_to_preview = image_urls[:count]
    
    print("\n" + "=" * 50)
    print(f"--- IMAGE PREVIEW: Verification Step (Showing first {len(urls_to_preview)} images) ---")
    
    all_previews_successful = True
    temp_files = []

    for i, img_url in enumerate(urls_to_preview):
        # Use a unique name for each preview image
        current_temp_file_base = f"temp_preview_{i}"

        try:
            print(f"Downloading preview image {i+1}/{len(urls_to_preview)} from: {img_url}")
            
            img_response = requests.get(img_url, headers=headers, stream=True, timeout=10)
            img_response.raise_for_status()

            # Determine file extension
            content_type = img_response.headers.get('Content-Type', '').lower()
            ext_match = re.search(r'image/(\w+)', content_type)
            ext = ext_match.group(1) if ext_match else 'jpg'
            
            current_temp_file = f"{current_temp_file_base}.{ext}"
            temp_files.append(current_temp_file)
            
            # Save to temporary file
            with open(current_temp_file, 'wb') as f:
                for chunk in img_response.iter_content(1024):
                    f.write(chunk)
            
            # Display the image
            try:
                img = Image.open(current_temp_file)
                img.show()
                print(f"Image {i+1} is now displayed on your screen.")
            except Exception:
                print(f"Could not open image {i+1} automatically. Please check {os.path.abspath(current_temp_file)} manually.")

            time.sleep(0.5) # Small delay to separate previews
            
        except requests.exceptions.RequestException as e:
            print(f"Error downloading preview image {i+1}. Skipping: {e}")
            all_previews_successful = False
            
        except Exception as e:
            print(f"An unexpected error occurred during preview {i+1}. Skipping: {e}")
            all_previews_successful = False
            
    # Final Confirmation
    print("\n" + "=" * 50)
    print("Verification Complete.")
    
    if not all_previews_successful and urls_to_preview:
        print("WARNING: Some preview images failed to download. Proceed with caution.")
        
    while True:
        confirm = input("Based on the 3 preview images, does the data look correct? Continue with the full batch? (Y/N): ").upper()
        if confirm in ['Y', 'N']:
            break
        print("Invalid input. Please enter Y or N.")

    # Clean up all temporary preview files
    for temp_file in temp_files:
        if os.path.exists(temp_file):
            try:
                os.remove(temp_file) 
            except OSError:
                pass

    return confirm == 'Y'

def download_images_batch(image_urls, headers, car_name, global_label, final_dir):
    """Downloads images one by one, prompts for confirmation, and saves them to the final directory."""
    
    print(f"\n--- Batch Process Started ---")
    print(f"Car: {car_name} | Batch Label: {global_label} | Destination: {final_dir}")
    print(f"Processing {len(image_urls)} unique images...")
    
    temp_path = '' # Initialize temp_path
    
    for i, img_url in enumerate(image_urls):
        try:
            print("-" * 50)
            print(f"Processing Image [{i+1}/{len(image_urls)}]: {os.path.basename(urlparse(img_url).path)}")

            # 1. Download the image content
            img_response = requests.get(img_url, headers=headers, stream=True, timeout=10)
            img_response.raise_for_status()

            # 2. Save and display the image
            # save_temp_image returns a path like 'temp_download.jpg'
            temp_path, ext = save_temp_image(img_response) 
            
            try:
                img = Image.open(temp_path)
                img.show()
                print(f"Image opened. Please classify this specific image (overrides batch label if necessary).")
            except Exception:
                print(f"Could not open image automatically. Please check {os.path.abspath(temp_path)} manually.")
                
            
            # 3. Interactive Classification and Confirmation
            while True:
                # The user can override the batch classification for this single image
                confirm = input("Keep this image? [Y]es (saves as batch label), [N]o (skip), [M]odified (save as modified), [S]tock (save as stock): ").upper()
                if confirm in ['Y', 'N', 'M', 'S']:
                    break
                print("Invalid input. Please enter Y, N, M, or S.")

            if confirm == 'N':
                print("Skipping and deleting temporary image.")
                os.remove(temp_path)
                time.sleep(0.1)
                continue
            
            # Determine the final label and directory based on user input
            if confirm == 'M':
                current_label = 'Modified'
                # Path is updated to the sibling 'Modified' directory
                current_dir = os.path.join(os.path.dirname(final_dir), current_label)
            elif confirm == 'S':
                current_label = 'Stock'
                # Path is updated to the sibling 'Stock' directory
                current_dir = os.path.join(os.path.dirname(final_dir), current_label)
            else: # 'Y' uses the global batch label
                current_label = global_label
                current_dir = final_dir
                
            # Ensure the specific target directory exists (important if classification is overridden)
            os.makedirs(current_dir, exist_ok=True)


            # 4. Rename and Move
            timestamp = int(time.time() * 1000)
            new_filename = f"{car_name}_{current_label}_{timestamp}.{ext}"
            final_path = os.path.join(current_dir, new_filename)
            
            os.rename(temp_path, final_path)

            print(f"SUCCESS: Saved as '{new_filename}' to {current_dir}")
            
            # 5. Respectful delay
            time.sleep(0.2) 

        except requests.exceptions.RequestException as e:
            print(f"Error downloading {img_url}: {e}")
            if os.path.exists(temp_path): os.remove(temp_path)
            time.sleep(0.2) 
        except Exception as e:
            print(f"An unexpected error occurred during processing: {e}")
            if os.path.exists(temp_path): os.remove(temp_path)
            time.sleep(0.2) 
        finally:
            # Ensure any temporary files are cleaned up if they still exist under the default temp name
            if os.path.exists(temp_path) and temp_path.startswith('temp_download.'):
                 try:
                     os.remove(temp_path)
                 except OSError:
                     pass # File might have been renamed/moved already


# --- MAIN EXECUTION ---

if __name__ == "__main__":
    
    # --- PHASE 1: User Input and Setup ---
    print("=" * 50)
    print("--- Car Image Scraper & Batch Labeler ---")
    print("=" * 50)
    
    # Get Car Details for Folder Structure
    car_make = input("1. Enter Car Make (e.g., BMW): ").strip()
    car_model = input("2. Enter Car Model (e.g., 328i): ").strip()
    car_year = input("3. Enter Car Year (e.g., 2013): ").strip()
    
    # Create clean, uppercase folder name for the make
    car_make_folder = car_make.replace(' ', '_').replace('-', '_').strip('_').upper()
    car_name = f"{car_make}_{car_model}_{car_year}".replace(' ', '_').replace('-', '_').strip('_')
    
    if not car_name or not car_make_folder:
        print("Error: Car details cannot be empty. Exiting.")
        exit()

    # Define Dynamic Paths
    MAKE_DIR = os.path.join(BASE_DIR, car_make_folder)
    STOCK_DIR_FINAL = os.path.join(MAKE_DIR, 'Stock')
    MODIFIED_DIR_FINAL = os.path.join(MAKE_DIR, 'Modified')

    # Get Batch Classification
    classification_choice = ""
    while classification_choice not in ['S', 'M']:
        classification_choice = input("\n4. Primary Batch Classification: Is this batch primarily [S]tock or [M]odified? ").upper()

    global_label = 'Stock' if classification_choice == 'S' else 'Modified'
    final_dir = STOCK_DIR_FINAL if classification_choice == 'S' else MODIFIED_DIR_FINAL
    
    # Create the target directory structure
    os.makedirs(final_dir, exist_ok=True)
    
    print("-" * 50)
    print(f"Output folder structure prepared: {final_dir}")
    
    # --- PHASE 2: Scraping and Preview ---
    urls_to_scrape = get_image_urls(TARGET_URL, HEADERS)
    
    if urls_to_scrape:
        # Check how many images we can preview (up to 3)
        preview_count = min(3, len(urls_to_scrape))
        
        # Call the new multiple-image preview function
        if preview_images(urls_to_scrape, HEADERS, preview_count):
            # --- PHASE 3: Download and Labeling ---
            download_images_batch(urls_to_scrape, HEADERS, car_name, global_label, final_dir)
        else:
            print("Operation canceled by user based on the preview. Exiting.")
    else:
        print("No image URLs were successfully extracted. Check the TARGET_URL setting.")
    
    print("\n" * 2)
    print("=" * 50)
    print(f"✅ Interactive Batch Labeling Session Concluded.")
    print(f"Data saved to the '{BASE_DIR}' folder.")
    print("=" * 50)

--- Car Image Scraper & Batch Labeler ---
Error: Car details cannot be empty. Exiting.
--------------------------------------------------
Output folder structure prepared: Car_Images_Dataset\Stock
Fetching HTML (via requests) from: https://bringatrailer.com/listing/2013-bmw-328i-sedan-12/

--- IMAGE PREVIEW: Verification Step (Showing first 3 images) ---
Downloading preview image 1/3 from: https://bringatrailer.com/wp-content/uploads/2025/11/2013_bmw_328i-sedan_IMG_2750-35177-scaled.jpg
Image 1 is now displayed on your screen.
Downloading preview image 2/3 from: https://bringatrailer.com/wp-content/uploads/2025/11/2013_bmw_328i-sedan_IMG_2741-35122-scaled.jpg
Image 2 is now displayed on your screen.
Downloading preview image 3/3 from: https://bringatrailer.com/wp-content/uploads/2025/11/2013_bmw_328i-sedan_IMG_2692-34792-scaled.jpg
Image 3 is now displayed on your screen.

Verification Complete.
Invalid input. Please enter Y or N.

--- Batch Process Started ---
Car: BMW_328i | Batch La

: 

In [None]:
import requests
from bs4 import BeautifulSoup
import os
import time
import re
from PIL import Image 
import json
from urllib.parse import urlparse, parse_qs

# --- CONFIGURATION ---

# 1. *** CHANGE THIS TO YOUR TARGET URL (e.g., a specific BaT listing) ***
# This script is configured to fetch data from this URL.
TARGET_URL = 'https://bringatrailer.com/listing/2013-bmw-328i-sedan-12/' 

# Base directory for the entire dataset
BASE_DIR = 'Car_Images_Dataset' 

# 3. SET A USER-AGENT for direct image downloads
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# --- HELPER FUNCTIONS ---

def get_image_urls(url, headers):
    """
    Fetches the HTML and extracts image URLs directly by 
    parsing the 'data-gallery-items' JSON attribute, making it very fast.
    """
    print(f"Fetching HTML (via requests) from: {url}")
    
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status() 
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Locate the gallery data DIV
        gallery_div = soup.find('div', {'data-gallery-items': True})
        
        if not gallery_div:
            print("ERROR: Could not find the 'data-gallery-items' div.")
            return []

        # Extract and parse the JSON string
        json_string = gallery_div['data-gallery-items']
        image_data = json.loads(json_string)
        
        image_urls = []
        for item in image_data:
            if 'large' in item and 'url' in item['large']:
                full_url = item['large']['url']
                
                # Clean up the URL: remove query parameters to get the full resolution image
                parsed_url = urlparse(full_url)
                clean_url = parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path
                
                if clean_url not in image_urls:
                    image_urls.append(clean_url)

        return list(set(image_urls))

    except requests.exceptions.RequestException as e:
        print(f"An error occurred during network request: {e}")
        return []
    except json.JSONDecodeError as e:
        print(f"An error occurred while parsing JSON from 'data-gallery-items': {e}")
        return []
    except Exception as e:
        print(f"An unexpected error occurred during extraction: {e}")
        return []

def save_temp_image(img_response):
    """Saves the image content to a temporary file (temp_download.ext) and returns its path and extension."""
    
    # Determine file extension based on content type
    content_type = img_response.headers.get('Content-Type', '').lower()
    ext_match = re.search(r'image/(\w+)', content_type)
    ext = ext_match.group(1) if ext_match else 'jpg'
    
    temp_file = f"temp_download.{ext}"
    
    with open(temp_file, 'wb') as f:
        # Write chunks to file
        for chunk in img_response.iter_content(1024):
            f.write(chunk)
    return temp_file, ext

def preview_images(image_urls, headers, count=3):
    """Downloads and displays the first 'count' images for preview and asks for confirmation."""
    
    urls_to_preview = image_urls[:count]
    
    print("\n" + "=" * 50)
    print(f"--- IMAGE PREVIEW: Verification Step (Showing first {len(urls_to_preview)} images) ---")
    
    all_previews_successful = True
    temp_files = []

    for i, img_url in enumerate(urls_to_preview):
        # Use a unique name for each preview image
        current_temp_file_base = f"temp_preview_{i}"

        try:
            print(f"Downloading preview image {i+1}/{len(urls_to_preview)} from: {img_url}")
            
            img_response = requests.get(img_url, headers=headers, stream=True, timeout=10)
            img_response.raise_for_status()

            # Determine file extension
            content_type = img_response.headers.get('Content-Type', '').lower()
            ext_match = re.search(r'image/(\w+)', content_type)
            ext = ext_match.group(1) if ext_match else 'jpg'
            
            current_temp_file = f"{current_temp_file_base}.{ext}"
            temp_files.append(current_temp_file)
            
            # Save to temporary file
            with open(current_temp_file, 'wb') as f:
                for chunk in img_response.iter_content(1024):
                    f.write(chunk)
            
            # Display the image
            try:
                img = Image.open(current_temp_file)
                img.show()
                print(f"Image {i+1} is now displayed on your screen.")
            except Exception:
                print(f"Could not open image {i+1} automatically. Please check {os.path.abspath(current_temp_file)} manually.")

            time.sleep(0.5) # Small delay to separate previews
            
        except requests.exceptions.RequestException as e:
            print(f"Error downloading preview image {i+1}. Skipping: {e}")
            all_previews_successful = False
            
        except Exception as e:
            print(f"An unexpected error occurred during preview {i+1}. Skipping: {e}")
            all_previews_successful = False
            
    # Final Confirmation
    print("\n" + "=" * 50)
    print("Verification Complete.")
    
    if not all_previews_successful and urls_to_preview:
        print("WARNING: Some preview images failed to download. Proceed with caution.")
        
    while True:
        confirm = input("Based on the 3 preview images, does the data look correct? Continue with the full batch? (Y/N): ").upper()
        if confirm in ['Y', 'N']:
            break
        print("Invalid input. Please enter Y or N.")

    # Clean up all temporary preview files
    for temp_file in temp_files:
        if os.path.exists(temp_file):
            try:
                os.remove(temp_file) 
            except OSError:
                pass

    return confirm == 'Y'

def download_images_batch(image_urls, headers, car_name, global_label, final_dir):
    """Downloads all images automatically and saves them using the global batch label."""
    
    print(f"\n--- Batch Process Started (Automatic Download) ---")
    print(f"Car: {car_name} | Label Applied to All: {global_label} | Destination: {final_dir}")
    print(f"Processing {len(image_urls)} unique images...")
    
    temp_path = '' 
    
    # Ensure the final directory exists (created in main, but good to double-check)
    os.makedirs(final_dir, exist_ok=True)
    
    for i, img_url in enumerate(image_urls):
        try:
            print("-" * 50)
            print(f"Downloading Image [{i+1}/{len(image_urls)}]: {os.path.basename(urlparse(img_url).path)}")

            # 1. Download the image content
            img_response = requests.get(img_url, headers=headers, stream=True, timeout=10)
            img_response.raise_for_status()

            # 2. Save the image to a temporary file
            temp_path, ext = save_temp_image(img_response) 
            
            # Use the global batch label and directory for all images (non-interactive)
            current_label = global_label
            current_dir = final_dir
            
            # 3. Rename and Move
            timestamp = int(time.time() * 1000)
            new_filename = f"{car_name}_{current_label}_{timestamp}.{ext}"
            final_path = os.path.join(current_dir, new_filename)
            
            os.rename(temp_path, final_path)

            print(f"SUCCESS: Saved as '{new_filename}' to {current_dir}")
            
            # 4. Respectful delay
            time.sleep(0.2) 

        except requests.exceptions.RequestException as e:
            print(f"Error downloading {img_url}: {e}")
            if os.path.exists(temp_path): os.remove(temp_path)
            time.sleep(0.2) 
        except Exception as e:
            print(f"An unexpected error occurred during processing: {e}")
            if os.path.exists(temp_path): os.remove(temp_path)
            time.sleep(0.2) 
        finally:
            # Ensure any temporary files are cleaned up if they still exist under the default temp name
            if os.path.exists(temp_path) and temp_path.startswith('temp_download.'):
                 try:
                     os.remove(temp_path)
                 except OSError:
                     pass


# --- MAIN EXECUTION ---

if __name__ == "__main__":
    
    # --- PHASE 1: User Input and Setup ---
    print("=" * 50)
    print("--- Car Image Scraper & Batch Labeler ---")
    print("=" * 50)
    
    # Get Car Details for Folder Structure
    car_make = input("1. Enter Car Make (e.g., BMW): ").strip()
    car_model = input("2. Enter Car Model (e.g., 328i): ").strip()
    car_year = input("3. Enter Car Year (e.g., 2013): ").strip()
    
    # Create clean, uppercase folder name for the make
    car_make_folder = car_make.replace(' ', '_').replace('-', '_').strip('_').upper()
    car_name = f"{car_make}_{car_model}_{car_year}".replace(' ', '_').replace('-', '_').strip('_')
    
    if not car_name or not car_make_folder:
        print("Error: Car details cannot be empty. Exiting.")
        exit()

    # Define Dynamic Paths
    MAKE_DIR = os.path.join(BASE_DIR, car_make_folder)
    STOCK_DIR_FINAL = os.path.join(MAKE_DIR, 'Stock')
    MODIFIED_DIR_FINAL = os.path.join(MAKE_DIR, 'Modified')

    # Get Batch Classification
    classification_choice = ""
    while classification_choice not in ['S', 'M']:
        classification_choice = input("\n4. Primary Batch Classification: Is this batch primarily [S]tock or [M]odified? ").upper()

    global_label = 'Stock' if classification_choice == 'S' else 'Modified'
    final_dir = STOCK_DIR_FINAL if classification_choice == 'S' else MODIFIED_DIR_FINAL
    
    # Create the target directory structure
    os.makedirs(final_dir, exist_ok=True)
    
    print("-" * 50)
    print(f"Output folder structure prepared: {final_dir}")
    
    # --- PHASE 2: Scraping and Preview ---
    urls_to_scrape = get_image_urls(TARGET_URL, HEADERS)
    
    if urls_to_scrape:
        # Check how many images we can preview (up to 3)
        preview_count = min(3, len(urls_to_scrape))
        
        # Call the new multiple-image preview function
        if preview_images(urls_to_scrape, HEADERS, preview_count):
            # --- PHASE 3: Download and Labeling (Non-Interactive) ---
            download_images_batch(urls_to_scrape, HEADERS, car_name, global_label, final_dir)
        else:
            print("Operation canceled by user based on the preview. Exiting.")
    else:
        print("No image URLs were successfully extracted. Check the TARGET_URL setting.")
    
    print("\n" * 2)
    print("=" * 50)
    print(f"✅ Interactive Batch Labeling Session Concluded.")
    print(f"Data saved to the '{BASE_DIR}' folder.")
    print("=" * 50)

--- Car Image Scraper & Batch Labeler ---
Error: Car details cannot be empty. Exiting.
--------------------------------------------------
Output folder structure prepared: Car_Images_Dataset\Stock
Fetching HTML (via requests) from: https://bringatrailer.com/listing/2013-bmw-328i-sedan-12/

--- IMAGE PREVIEW: Verification Step (Showing first 3 images) ---
Downloading preview image 1/3 from: https://bringatrailer.com/wp-content/uploads/2025/11/2013_bmw_328i-sedan_IMG_2670-34046-scaled.jpg
Image 1 is now displayed on your screen.
Downloading preview image 2/3 from: https://bringatrailer.com/wp-content/uploads/2025/11/2013_bmw_328i-sedan_IMG_2717-34954-scaled.jpg
Image 2 is now displayed on your screen.
Downloading preview image 3/3 from: https://bringatrailer.com/wp-content/uploads/2025/11/2013_bmw_328i-sedan_IMG_2712-34908-scaled.jpg
Image 3 is now displayed on your screen.

Verification Complete.
Invalid input. Please enter Y or N.

--- Batch Process Started (Automatic Download) ---
Car

: 

In [None]:
import requests # Used for making HTTP requests to download web pages and images
from bs4 import BeautifulSoup # Used for parsing HTML content
import os # Used for interacting with the operating system (creating directories, moving files)
import time # Used to introduce delays to be respectful to the server
import re # Used for regular expressions, specifically to extract file extensions
from PIL import Image # Pillow library, used for opening and displaying images locally
import json # Used for parsing JSON data embedded in the HTML
from urllib.parse import urlparse, parse_qs # Used for parsing and manipulating URLs

# --- CONFIGURATION ---

# 1. *** CHANGE THIS TO YOUR TARGET URL (e.g., a specific BaT listing) ***
# The script will scrape this URL for high-resolution images.
TARGET_URL = 'https://bringatrailer.com/listing/2013-bmw-328i-sedan-12/' 

# Base directory where all car make folders will be created.
BASE_DIR = 'Car_Images_Dataset' 

# 3. SET A USER-AGENT for direct image downloads
# This mimics a standard web browser request, which helps prevent blocks
# from the server that might deny requests from unidentified scripts.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# --- HELPER FUNCTIONS ---

def get_image_urls(url, headers):
    """
    Fetches the HTML from the target URL and extracts all high-resolution 
    image URLs by parsing the embedded 'data-gallery-items' JSON attribute.
    
    :param url: The URL of the car listing to scrape.
    :param headers: The HTTP headers including the User-Agent.
    :return: A list of unique, cleaned image URLs.
    """
    print(f"Fetching HTML (via requests) from: {url}")
    
    try:
        # Send a GET request to the target URL
        response = requests.get(url, headers=headers, timeout=15)
        # Check for HTTP errors (4xx or 5xx)
        response.raise_for_status() 
        
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Locate the specific div tag that contains the image gallery data as a JSON string
        gallery_div = soup.find('div', {'data-gallery-items': True})
        
        if not gallery_div:
            print("ERROR: Could not find the 'data-gallery-items' div. Scraper logic may need updating.")
            return []

        # Extract the JSON string from the attribute
        json_string = gallery_div['data-gallery-items']
        # Convert the JSON string into a Python list/dictionary
        image_data = json.loads(json_string)
        
        image_urls = []
        for item in image_data:
            # Check if the 'large' image URL is available
            if 'large' in item and 'url' in item['large']:
                full_url = item['large']['url']
                
                # The URL often contains query parameters (e.g., ?fit=crop) that reduce resolution.
                # This section cleans the URL to get the base image path for maximum resolution.
                parsed_url = urlparse(full_url)
                clean_url = parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path
                
                if clean_url not in image_urls:
                    image_urls.append(clean_url)

        return list(set(image_urls)) # Return only unique URLs

    except requests.exceptions.RequestException as e:
        print(f"An error occurred during network request: {e}")
        return []
    except json.JSONDecodeError as e:
        print(f"An error occurred while parsing JSON from 'data-gallery-items': {e}")
        return []
    except Exception as e:
        print(f"An unexpected error occurred during extraction: {e}")
        return []

def save_temp_image(img_response):
    """
    Saves the image content from a requests response to a temporary file 
    (named 'temp_download.<ext>') and determines the file extension.

    :param img_response: The requests response object containing image data.
    :return: A tuple of (temporary file path, file extension).
    """
    
    # Check the Content-Type header to determine the correct file extension
    content_type = img_response.headers.get('Content-Type', '').lower()
    ext_match = re.search(r'image/(\w+)', content_type)
    ext = ext_match.group(1) if ext_match else 'jpg' # Default to jpg if type is unknown
    
    temp_file = f"temp_download.{ext}"
    
    # Open the temporary file in binary write mode
    with open(temp_file, 'wb') as f:
        # Write chunks of data to the file to handle large images efficiently
        for chunk in img_response.iter_content(1024):
            f.write(chunk)
    return temp_file, ext

def preview_images(image_urls, headers, count=3):
    """
    Downloads and displays the first 'count' images for visual verification 
    before starting the full batch download.
    
    :param image_urls: List of all extracted image URLs.
    :param headers: The HTTP headers.
    :param count: The number of images to preview (default is 3).
    :return: Boolean confirmation (True to proceed, False to cancel).
    """
    
    # Select only the first 'count' images for preview
    urls_to_preview = image_urls[:count]
    
    print("\n" + "=" * 50)
    print(f"--- IMAGE PREVIEW: Verification Step (Showing first {len(urls_to_preview)} images) ---")
    
    all_previews_successful = True
    temp_files = [] # List to track temporary files for cleanup

    for i, img_url in enumerate(urls_to_preview):
        # Create a unique temporary filename for each preview image
        current_temp_file_base = f"temp_preview_{i}"

        try:
            print(f"Downloading preview image {i+1}/{len(urls_to_preview)} from: {img_url}")
            
            # Download the image data
            img_response = requests.get(img_url, headers=headers, stream=True, timeout=10)
            img_response.raise_for_status()

            # Determine extension and path
            content_type = img_response.headers.get('Content-Type', '').lower()
            ext_match = re.search(r'image/(\w+)', content_type)
            ext = ext_match.group(1) if ext_match else 'jpg'
            current_temp_file = f"{current_temp_file_base}.{ext}"
            temp_files.append(current_temp_file)
            
            # Save the image to the unique temporary file
            with open(current_temp_file, 'wb') as f:
                for chunk in img_response.iter_content(1024):
                    f.write(chunk)
            
            # Use Pillow (PIL) to display the image for the user
            try:
                img = Image.open(current_temp_file)
                img.show()
                print(f"Image {i+1} is now displayed on your screen.")
            except Exception:
                # Fallback if automatic display fails
                print(f"Could not open image {i+1} automatically. Please check {os.path.abspath(current_temp_file)} manually.")

            time.sleep(0.5) # Wait half a second before trying to show the next one
            
        except requests.exceptions.RequestException as e:
            print(f"Error downloading preview image {i+1}. Skipping: {e}")
            all_previews_successful = False
            
        except Exception as e:
            print(f"An unexpected error occurred during preview {i+1}. Skipping: {e}")
            all_previews_successful = False
            
    # Final Confirmation Prompt
    print("\n" + "=" * 50)
    print("Verification Complete.")
    
    if not all_previews_successful and urls_to_preview:
        print("WARNING: Some preview images failed to download. Proceed with caution.")
        
    while True:
        confirm = input("Based on the 3 preview images, does the data look correct? Continue with the full batch? (Y/N): ").upper()
        if confirm in ['Y', 'N']:
            break
        print("Invalid input. Please enter Y or N.")

    # Clean up: Delete all temporary preview files
    for temp_file in temp_files:
        if os.path.exists(temp_file):
            try:
                os.remove(temp_file) 
            except OSError:
                pass

    return confirm == 'Y'

def download_images_batch(image_urls, headers, car_name, global_label, final_dir):
    """
    Downloads all images automatically and saves them using the global batch label.
    This function is non-interactive after the verification phase.

    :param image_urls: List of all image URLs to download.
    :param headers: The HTTP headers.
    :param car_name: Formatted string of the car (e.g., BMW_328i_2013).
    :param global_label: The primary classification ('Stock' or 'Modified') for all files.
    :param final_dir: The full path to the final classification folder.
    """
    
    print(f"\n--- Batch Process Started (Automatic Download) ---")
    print(f"Car: {car_name} | Label Applied to All: {global_label} | Destination: {final_dir}")
    print(f"Processing {len(image_urls)} unique images...")
    
    temp_path = '' # Placeholder for the temporary file path
    
    # Ensure the final directory exists (created in main, but good to double-check)
    os.makedirs(final_dir, exist_ok=True)
    
    for i, img_url in enumerate(image_urls):
        try:
            print("-" * 50)
            print(f"Downloading Image [{i+1}/{len(image_urls)}]: {os.path.basename(urlparse(img_url).path)}")

            # 1. Download the image content
            img_response = requests.get(img_url, headers=headers, stream=True, timeout=10)
            img_response.raise_for_status()

            # 2. Save the image to a temporary file and get its extension
            temp_path, ext = save_temp_image(img_response) 
            
            # The label and directory are fixed based on the initial user input (non-interactive)
            current_label = global_label
            current_dir = final_dir
            
            # 3. Rename and Move
            timestamp = int(time.time() * 1000) # Use a timestamp to ensure unique filenames
            # Format: CarName_Label_Timestamp.ext
            new_filename = f"{car_name}_{current_label}_{timestamp}.{ext}"
            final_path = os.path.join(current_dir, new_filename)
            
            # Atomically rename/move the temporary file to its final, permanent location
            os.rename(temp_path, final_path)

            print(f"SUCCESS: Saved as '{new_filename}' to {current_dir}")
            
            # 4. Respectful delay
            time.sleep(0.2) 

        except requests.exceptions.RequestException as e:
            print(f"Error downloading {img_url}: {e}")
            if os.path.exists(temp_path): os.remove(temp_path)
            time.sleep(0.2) 
        except Exception as e:
            print(f"An unexpected error occurred during processing: {e}")
            if os.path.exists(temp_path): os.remove(temp_path)
            time.sleep(0.2) 
        finally:
            # Cleanup check for the general 'temp_download' file 
            # (only if an error occurred before rename)
            if os.path.exists(temp_path) and temp_path.startswith('temp_download.'):
                 try:
                     os.remove(temp_path)
                 except OSError:
                     pass


# --- MAIN EXECUTION ---

if __name__ == "__main__":
    
    # --- PHASE 1: User Input and Setup ---
    print("=" * 50)
    print("--- Car Image Scraper & Batch Labeler ---")
    print("=" * 50)
    
    # 1. Get Car Details for Folder Structure
    car_make = input("1. Enter Car Make (e.g., BMW): ").strip()
    car_model = input("2. Enter Car Model (e.g., 328i): ").strip()
    car_year = input("3. Enter Car Year (e.g., 2013): ").strip()
    
    # Format the input for safe file and folder naming
    car_make_folder = car_make.replace(' ', '_').replace('-', '_').strip('_').upper()
    car_name = f"{car_make}_{car_model}_{car_year}".replace(' ', '_').replace('-', '_').strip('_')
    
    if not car_name or not car_make_folder:
        print("Error: Car details cannot be empty. Exiting.")
        exit()

    # 2. Define Dynamic Paths based on user input
    MAKE_DIR = os.path.join(BASE_DIR, car_make_folder)
    STOCK_DIR_FINAL = os.path.join(MAKE_DIR, 'Stock')
    MODIFIED_DIR_FINAL = os.path.join(MAKE_DIR, 'Modified')

    # 3. Get Batch Classification (Stock or Modified)
    classification_choice = ""
    while classification_choice not in ['S', 'M']:
        classification_choice = input("\n4. Primary Batch Classification: Is this batch primarily [S]tock or [M]odified? ").upper()

    # Determine the final label and path based on the classification choice
    global_label = 'Stock' if classification_choice == 'S' else 'Modified'
    final_dir = STOCK_DIR_FINAL if classification_choice == 'S' else MODIFIED_DIR_FINAL
    
    # Create the final target directory structure (e.g., Car_Images_Dataset/BMW/Stock)
    os.makedirs(final_dir, exist_ok=True)
    
    print("-" * 50)
    print(f"Output folder structure prepared: {final_dir}")
    
    # --- PHASE 2: Scraping and Preview ---
    # Get the list of all image URLs
    urls_to_scrape = get_image_urls(TARGET_URL, HEADERS)
    
    if urls_to_scrape:
        # Determine how many images to preview (maximum of 3)
        preview_count = min(3, len(urls_to_scrape))
        
        # Call the verification function. If it returns True, proceed with download.
        if preview_images(urls_to_scrape, HEADERS, preview_count):
            # --- PHASE 3: Download and Labeling (Non-Interactive) ---
            download_images_batch(urls_to_scrape, HEADERS, car_name, global_label, final_dir)
        else:
            print("Operation canceled by user based on the preview. Exiting.")
    else:
        print("No image URLs were successfully extracted. Check the TARGET_URL setting.")
    
    print("\n" * 2)
    print("=" * 50)
    print(f"✅ Interactive Batch Labeling Session Concluded.")
    print(f"Data saved to the '{BASE_DIR}' folder.")
    print("=" * 50)

In [1]:
import requests # Used for making HTTP requests to download web pages and images
from bs4 import BeautifulSoup # Used for parsing HTML content
import os # Used for interacting with the operating system (creating directories, moving files)
import time # Used to introduce delays to be respectful to the server
import re # Used for regular expressions, specifically to extract file extensions
from PIL import Image # Pillow library, used for opening and displaying images locally
import json # Used for parsing JSON data embedded in the HTML
from urllib.parse import urlparse, parse_qs # Used for parsing and manipulating URLs

# --- CONFIGURATION ---

# 1. *** CHANGE THIS TO YOUR TARGET URL (e.g., a specific BaT listing) ***
# The script will scrape this URL for high-resolution images.
TARGET_URL = 'https://bringatrailer.com/listing/2016-bmw-340i-sedan-4/' 

# Base directory where all car make folders will be created.
BASE_DIR = 'Car_Images_Dataset' 

# 3. SET A USER-AGENT for direct image downloads
# This mimics a standard web browser request, which helps prevent blocks
# from the server that might deny requests from unidentified scripts.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# --- HELPER FUNCTIONS ---

def get_image_urls(url, headers):
    """
    Fetches the HTML from the target URL and extracts all high-resolution 
    image URLs by parsing the embedded 'data-gallery-items' JSON attribute.
    
    :param url: The URL of the car listing to scrape.
    :param headers: The HTTP headers including the User-Agent.
    :return: A list of unique, cleaned image URLs.
    """
    print(f"Fetching HTML (via requests) from: {url}")
    
    try:
        # Send a GET request to the target URL
        response = requests.get(url, headers=headers, timeout=15)
        # Check for HTTP errors (4xx or 5xx)
        response.raise_for_status() 
        
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Locate the specific div tag that contains the image gallery data as a JSON string
        gallery_div = soup.find('div', {'data-gallery-items': True})
        
        if not gallery_div:
            print("ERROR: Could not find the 'data-gallery-items' div. Scraper logic may need updating.")
            return []

        # Extract the JSON string from the attribute
        json_string = gallery_div['data-gallery-items']
        # Convert the JSON string into a Python list/dictionary
        image_data = json.loads(json_string)
        
        image_urls = []
        for item in image_data:
            # Check if the 'large' image URL is available
            if 'large' in item and 'url' in item['large']:
                full_url = item['large']['url']
                
                # The URL often contains query parameters (e.g., ?fit=crop) that reduce resolution.
                # This section cleans the URL to get the base image path for maximum resolution.
                parsed_url = urlparse(full_url)
                clean_url = parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path
                
                if clean_url not in image_urls:
                    image_urls.append(clean_url)

        return list(set(image_urls)) # Return only unique URLs

    except requests.exceptions.RequestException as e:
        print(f"An error occurred during network request: {e}")
        return []
    except json.JSONDecodeError as e:
        print(f"An error occurred while parsing JSON from 'data-gallery-items': {e}")
        return []
    except Exception as e:
        print(f"An unexpected error occurred during extraction: {e}")
        return []

def save_temp_image(img_response):
    """
    Saves the image content from a requests response to a temporary file 
    (named 'temp_download.<ext>') and determines the file extension.

    :param img_response: The requests response object containing image data.
    :return: A tuple of (temporary file path, file extension).
    """
    
    # Check the Content-Type header to determine the correct file extension
    content_type = img_response.headers.get('Content-Type', '').lower()
    ext_match = re.search(r'image/(\w+)', content_type)
    ext = ext_match.group(1) if ext_match else 'jpg' # Default to jpg if type is unknown
    
    temp_file = f"temp_download.{ext}"
    
    # Open the temporary file in binary write mode
    with open(temp_file, 'wb') as f:
        # Write chunks of data to the file to handle large images efficiently
        for chunk in img_response.iter_content(1024):
            f.write(chunk)
    return temp_file, ext

def preview_images(image_urls, headers, count=3):
    """
    Downloads and displays the first 'count' images for visual verification 
    before starting the full batch download.
    
    :param image_urls: List of all extracted image URLs.
    :param headers: The HTTP headers.
    :param count: The number of images to preview (default is 3).
    :return: Boolean confirmation (True to proceed, False to cancel).
    """
    
    # Select only the first 'count' images for preview
    urls_to_preview = image_urls[:count]
    
    print("\n" + "=" * 50)
    print(f"--- IMAGE PREVIEW: Verification Step (Showing first {len(urls_to_preview)} images) ---")
    
    all_previews_successful = True
    temp_files = [] # List to track temporary files for cleanup

    for i, img_url in enumerate(urls_to_preview):
        # Create a unique temporary filename for each preview image
        current_temp_file_base = f"temp_preview_{i}"

        try:
            print(f"Downloading preview image {i+1}/{len(urls_to_preview)} from: {img_url}")
            
            # Download the image data
            img_response = requests.get(img_url, headers=headers, stream=True, timeout=10)
            img_response.raise_for_status()

            # Determine extension and path
            content_type = img_response.headers.get('Content-Type', '').lower()
            ext_match = re.search(r'image/(\w+)', content_type)
            ext = ext_match.group(1) if ext_match else 'jpg'
            current_temp_file = f"{current_temp_file_base}.{ext}"
            temp_files.append(current_temp_file)
            
            # Save the image to the unique temporary file
            with open(current_temp_file, 'wb') as f:
                for chunk in img_response.iter_content(1024):
                    f.write(chunk)
            
            # Use Pillow (PIL) to display the image for the user
            try:
                img = Image.open(current_temp_file)
                img.show()
                print(f"Image {i+1} is now displayed on your screen.")
            except Exception:
                # Fallback if automatic display fails
                print(f"Could not open image {i+1} automatically. Please check {os.path.abspath(current_temp_file)} manually.")

            time.sleep(0.5) # Wait half a second before trying to show the next one
            
        except requests.exceptions.RequestException as e:
            print(f"Error downloading preview image {i+1}. Skipping: {e}")
            all_previews_successful = False
            
        except Exception as e:
            print(f"An unexpected error occurred during preview {i+1}. Skipping: {e}")
            all_previews_successful = False
            
    # Final Confirmation Prompt
    print("\n" + "=" * 50)
    print("Verification Complete.")
    
    if not all_previews_successful and urls_to_preview:
        print("WARNING: Some preview images failed to download. Proceed with caution.")
        
    while True:
        confirm = input("Based on the 3 preview images, does the data look correct? Continue with the full batch? (Y/N): ").upper()
        if confirm in ['Y', 'N']:
            break
        print("Invalid input. Please enter Y or N.")

    # Clean up: Delete all temporary preview files
    for temp_file in temp_files:
        if os.path.exists(temp_file):
            try:
                os.remove(temp_file) 
            except OSError:
                pass

    return confirm == 'Y'

def download_images_batch(image_urls, headers, car_name, global_label, final_dir):
    """
    Downloads all images automatically and saves them using the global batch label.
    This function is non-interactive after the verification phase.

    :param image_urls: List of all image URLs to download.
    :param headers: The HTTP headers.
    :param car_name: Formatted string of the car (e.g., BMW_328i_2013). This is used for the filename.
    :param global_label: The primary classification ('Stock' or 'Modified') for all files.
    :param final_dir: The full hierarchical path to the final classification folder.
    """
    
    print(f"\n--- Batch Process Started (Automatic Download) ---")
    print(f"Car: {car_name} | Label Applied to All: {global_label} | Destination: {final_dir}")
    print(f"Processing {len(image_urls)} unique images...")
    
    temp_path = '' # Placeholder for the temporary file path
    
    # Ensure the final directory exists (created in main, but good to double-check)
    os.makedirs(final_dir, exist_ok=True)
    
    for i, img_url in enumerate(image_urls):
        try:
            print("-" * 50)
            print(f"Downloading Image [{i+1}/{len(image_urls)}]: {os.path.basename(urlparse(img_url).path)}")

            # 1. Download the image content
            img_response = requests.get(img_url, headers=headers, stream=True, timeout=10)
            img_response.raise_for_status()

            # 2. Save the image to a temporary file and get its extension
            temp_path, ext = save_temp_image(img_response) 
            
            # The label and directory are fixed based on the initial user input (non-interactive)
            current_label = global_label
            current_dir = final_dir
            
            # 3. Rename and Move
            timestamp = int(time.time() * 1000) # Use a timestamp to ensure unique filenames
            # Format: CarName_Label_Timestamp.ext
            new_filename = f"{car_name}_{current_label}_{timestamp}.{ext}"
            final_path = os.path.join(current_dir, new_filename)
            
            # Atomically rename/move the temporary file to its final, permanent location
            os.rename(temp_path, final_path)

            print(f"SUCCESS: Saved as '{new_filename}' to {current_dir}")
            
            # 4. Respectful delay
            time.sleep(0.2) 

        except requests.exceptions.RequestException as e:
            print(f"Error downloading {img_url}: {e}")
            if os.path.exists(temp_path): os.remove(temp_path)
            time.sleep(0.2) 
        except Exception as e:
            print(f"An unexpected error occurred during processing: {e}")
            if os.path.exists(temp_path): os.remove(temp_path)
            time.sleep(0.2) 
        finally:
            # Cleanup check for the general 'temp_download' file 
            # (only if an error occurred before rename)
            if os.path.exists(temp_path) and temp_path.startswith('temp_download.'):
                 try:
                     os.remove(temp_path)
                 except OSError:
                     pass


# --- MAIN EXECUTION ---

if __name__ == "__main__":
    
    # --- PHASE 1: User Input and Setup ---
    print("=" * 50)
    print("--- Car Image Scraper & Hierarchical Batch Labeler ---")
    print("=" * 50)
    
    # 1. Get Car Details for Folder Structure
    car_make = input("1. Enter Car Make (e.g., BMW): ").strip()
    car_model = input("2. Enter Car Model (e.g., 328i): ").strip()
    car_year = input("3. Enter Car Year (e.g., 2013): ").strip()
    
    # Format inputs for safe file and folder naming
    car_make_folder = car_make.replace(' ', '_').replace('-', '_').strip('_').upper()
    # This variable is used for the *filename* (e.g., BMW_328i_2013_Stock_1234.jpg)
    car_name = f"{car_make}_{car_model}_{car_year}".replace(' ', '_').replace('-', '_').strip('_')
    # This variable is used for the *directory* (e.g., 328i_2013)
    car_model_year_folder = f"{car_model}_{car_year}".replace(' ', '_').replace('-', '_').strip('_')

    if not car_name or not car_make_folder:
        print("Error: Car details cannot be empty. Exiting.")
        exit()

    # 2. Define Dynamic Paths based on user input (NEW HIERARCHY)
    MAKE_DIR = os.path.join(BASE_DIR, car_make_folder)
    # The new intermediate directory for the specific car model/year
    CAR_MODEL_DIR = os.path.join(MAKE_DIR, car_model_year_folder) 
    
    STOCK_DIR_FINAL = os.path.join(CAR_MODEL_DIR, 'Stock')
    MODIFIED_DIR_FINAL = os.path.join(CAR_MODEL_DIR, 'Modified')

    # 3. Get Batch Classification (Stock or Modified)
    classification_choice = ""
    while classification_choice not in ['S', 'M']:
        classification_choice = input("\n4. Primary Batch Classification: Is this batch primarily [S]tock or [M]odified? ").upper()

    # Determine the final label and path based on the classification choice
    global_label = 'Stock' if classification_choice == 'S' else 'Modified'
    final_dir = STOCK_DIR_FINAL if classification_choice == 'S' else MODIFIED_DIR_FINAL
    
    # Create the complete hierarchical directory structure 
    # (e.g., Car_Images_Dataset/BMW/328i_2013/Stock)
    os.makedirs(final_dir, exist_ok=True)
    
    print("-" * 50)
    print(f"Output folder structure prepared: {final_dir}")
    
    # --- PHASE 2: Scraping and Preview ---
    # Get the list of all image URLs
    urls_to_scrape = get_image_urls(TARGET_URL, HEADERS)
    
    if urls_to_scrape:
        # Determine how many images to preview (maximum of 3)
        preview_count = min(3, len(urls_to_scrape))
        
        # Call the verification function. If it returns True, proceed with download.
        if preview_images(urls_to_scrape, HEADERS, preview_count):
            # --- PHASE 3: Download and Labeling (Non-Interactive) ---
            download_images_batch(urls_to_scrape, HEADERS, car_name, global_label, final_dir)
        else:
            print("Operation canceled by user based on the preview. Exiting.")
    else:
        print("No image URLs were successfully extracted. Check the TARGET_URL setting.")
    
    print("\n" * 2)
    print("=" * 50)
    print(f"✅ Interactive Batch Labeling Session Concluded.")
    print(f"Data saved to the '{BASE_DIR}' folder.")
    print("=" * 50)

--- Car Image Scraper & Hierarchical Batch Labeler ---
--------------------------------------------------
Output folder structure prepared: Car_Images_Dataset\BMW\340iM_sport_2016\Stock
Fetching HTML (via requests) from: https://bringatrailer.com/listing/2016-bmw-340i-sedan-4/

--- IMAGE PREVIEW: Verification Step (Showing first 3 images) ---
Downloading preview image 1/3 from: https://bringatrailer.com/wp-content/uploads/2025/09/IMG_7733-21015-scaled.jpg
Image 1 is now displayed on your screen.
Downloading preview image 2/3 from: https://bringatrailer.com/wp-content/uploads/2025/09/IMG_7744-21106-scaled.jpg
Image 2 is now displayed on your screen.
Downloading preview image 3/3 from: https://bringatrailer.com/wp-content/uploads/2025/09/IMG_7705-20877-scaled.jpg
Image 3 is now displayed on your screen.

Verification Complete.

--- Batch Process Started (Automatic Download) ---
Car: BMW_340iM_sport_2016 | Label Applied to All: Stock | Destination: Car_Images_Dataset\BMW\340iM_sport_2016\