# 01 Data Collection

## Purpose:
This notebook is used to collect images for the animal subspecies dataset using web scraping or APIs.

## Steps:
1. Define the classes and the search query (scientific name) used to build each class's target URL.
2. Use web scraping tools or APIs to download images.
3. Organize downloaded images into `data/raw/`, with one subfolder per class.

In [14]:
import os
import time
from urllib.parse import quote_plus

import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Set up ChromeDriver using the Service class.
# The driver location is machine-specific, so it can be overridden with the
# CHROMEDRIVER_PATH environment variable; the original Windows path remains
# the default so existing setups keep working unchanged.
chrome_driver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    r"C:\Tools\chromedriver-win64\chromedriver.exe",
)
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service)

# Define classes and keywords.
# Keys become the per-class folder names and file-name prefixes; values are
# the scientific names used as the image-search query.
classes = {
    "Domestic_Dogs": "Canis lupus familiaris",
    "North_American_Brown_Bear": "Ursus arctos horribilis",
    "Siberian_Tiger": "Panthera tigris altaica",
    "Domestic_Cats": "Felis silvestris catus",
    "African_Lion": "Panthera leo leo",
    "Asiatic_Lion": "Panthera leo persica",
    "Qinling_Panda": "Ailuropoda melanoleuca qinlingensis",
    "Sichuan_Panda": "Ailuropoda melanoleuca melanoleuca",
    "Himalayan_Red_Panda": "Ailurus fulgens fulgens",
    "Chinese_Red_Panda": "Ailurus fulgens styani"
}

# Directory to save images (one sub-folder per class is created on demand)
output_dir = "data/raw"
os.makedirs(output_dir, exist_ok=True)

# Function to download images
def download_images_bing(query, folder_name, num_images=1000,
                         max_scroll_attempts=50, request_timeout=10):
    """Download up to ``num_images`` Bing image-search results for ``query``.

    The search page is opened in the module-level Selenium ``driver`` and
    scrolled repeatedly so that more thumbnails are loaded. Each distinct
    image URL found on ``img.mimg`` elements is fetched with ``requests`` and
    written to ``<output_dir>/<folder_name>/<folder_name>_<index>.jpg``.

    Args:
        query: Search text; it is URL-encoded before being placed in the URL.
        folder_name: Sub-folder of ``output_dir`` and file-name prefix.
        num_images: Maximum number of images to save.
        max_scroll_attempts: Maximum number of scroll-and-scan passes.
        request_timeout: Timeout in seconds for each image request, so a
            stalled server cannot hang the whole run.

    Returns:
        The number of images actually saved.
    """
    # Queries are scientific names containing spaces, so encode them.
    search_url = f"https://www.bing.com/images/search?q={quote_plus(query)}"
    driver.get(search_url)
    time.sleep(2)  # Allow the page to load

    target_dir = os.path.join(output_dir, folder_name)
    os.makedirs(target_dir, exist_ok=True)

    downloaded_count = 0
    scroll_attempt = 0
    # Every pass re-reads *all* thumbnails currently on the page, so remember
    # which URLs were already attempted to avoid saving duplicates.
    seen_urls = set()

    while downloaded_count < num_images and scroll_attempt < max_scroll_attempts:
        # Scroll down to load more images
        driver.execute_script("window.scrollBy(0,document.body.scrollHeight)")
        time.sleep(3)  # Allow new images to load
        scroll_attempt += 1

        for img in driver.find_elements(By.CSS_SELECTOR, "img.mimg"):
            if downloaded_count >= num_images:
                break
            try:
                candidates = [
                    url
                    for url in (img.get_attribute("src"), img.get_attribute("data-src"))
                    if url
                ]
                if not candidates:
                    continue  # Element exposes no URL at all

                # Prefer a real URL over an inline base64 placeholder.
                img_url = next(
                    (url for url in candidates if not url.startswith("data:image")),
                    None,
                )
                if img_url is None:
                    print(f"Skipping base64 image for {folder_name}")
                    continue

                if img_url in seen_urls:
                    continue
                # Mark before fetching so a failing URL is not retried every pass.
                seen_urls.add(img_url)

                # Download the image
                response = requests.get(img_url, timeout=request_timeout)
                if response.status_code == 200:
                    file_path = os.path.join(
                        target_dir, f"{folder_name}_{downloaded_count}.jpg"
                    )
                    with open(file_path, "wb") as file:
                        file.write(response.content)
                    downloaded_count += 1
                    print(f"Downloaded {downloaded_count}/{num_images} images for {folder_name}")
            except Exception as e:
                # Best effort: one bad element or request must not abort the class.
                print(f"Error downloading image for {folder_name}: {e}")

    # The loop ended without reaching the target, i.e. the scroll budget ran out.
    if downloaded_count < num_images:
        print(f"Max scroll attempts reached for {folder_name}. Stopping early.")

    return downloaded_count

# Collect images for each class
try:
    for class_name, query in classes.items():
        print(f"Collecting images for: {class_name}")
        download_images_bing(query, class_name, num_images=1000)
finally:
    # Close the driver even if a class fails or the run is interrupted,
    # so no browser / chromedriver process is left behind.
    driver.quit()

Collecting images for: Domestic_Dogs
Skipping base64 image for Domestic_Dogs
Skipping base64 image for Domestic_Dogs
Skipping base64 image for Domestic_Dogs
Skipping base64 image for Domestic_Dogs
Skipping base64 image for Domestic_Dogs
Skipping base64 image for Domestic_Dogs
Skipping base64 image for Domestic_Dogs
Skipping base64 image for Domestic_Dogs
Downloaded 1/1000 images for Domestic_Dogs
Skipping base64 image for Domestic_Dogs
Downloaded 2/1000 images for Domestic_Dogs
Downloaded 3/1000 images for Domestic_Dogs
Downloaded 4/1000 images for Domestic_Dogs
Downloaded 5/1000 images for Domestic_Dogs
Downloaded 6/1000 images for Domestic_Dogs
Downloaded 7/1000 images for Domestic_Dogs
Downloaded 8/1000 images for Domestic_Dogs
Downloaded 9/1000 images for Domestic_Dogs
Downloaded 10/1000 images for Domestic_Dogs
Downloaded 11/1000 images for Domestic_Dogs
Downloaded 12/1000 images for Domestic_Dogs
Downloaded 13/1000 images for Domestic_Dogs
Downloaded 14/1000 images for Domestic_D