In [1]:
import time
import os
import requests
import shutil

from PIL import Image

import matplotlib.pyplot as plt
import cv2
from IPython.display import display, clear_output
from io import BytesIO

In [6]:
def load_api_key(file_path="../config.txt"):
    """Load the API key from a text file."""
    try:
        with open(file_path, "r") as file:
            for line in file:
                if line.startswith("BING_API_KEY"):
                    return line.strip().split("=")[1]
        raise ValueError("API key not found in config file.")
    except FileNotFoundError:
        raise FileNotFoundError(f"Config file not found at: {file_path}")

def scrape_bing_images(query, label, output_folder, count=100):
    """Scrape images using Bing Image Search API."""
    api_key = load_api_key()
    if not api_key:
        raise ValueError("API key not found or invalid.")

    url = "https://api.bing.microsoft.com/v7.0/images/search"
    headers = {"Ocp-Apim-Subscription-Key": api_key}
    params = {"q": query, "count": 50, "offset": 0, "imageType": "photo"}  # Start with 50 results per request

    os.makedirs(output_folder, exist_ok=True)

    downloaded = 0
    offset = 0  # For pagination
    attempts = 0  # Track attempts to prevent infinite loops

    while downloaded < count and attempts < 10:  # Limit to 10 pagination requests
        # Update offset for pagination
        params["offset"] = offset
        print(f"Fetching images starting at offset {offset}...")

        try:
            response = requests.get(url, headers=headers, params=params)
            response.raise_for_status()
            results = response.json()

            for idx, image in enumerate(results.get("value", [])):
                if downloaded >= count:
                    break  # Stop if we've reached the desired count

                try:
                    image_url = image.get("contentUrl")
                    if not image_url:
                        print(f"Image URL missing for result {idx + offset}. Skipping.")
                        continue

                    # Fetch the image
                    img_response = requests.get(image_url, timeout=10)
                    img_response.raise_for_status()

                    # Validate if the image is openable
                    try:
                        img = Image.open(BytesIO(img_response.content))
                        img.verify()  # Verify that it's a valid image
                    except Exception as e:
                        print(f"Invalid image at {image_url}: {e}")
                        continue

                    # Save the valid image
                    image_path = os.path.join(output_folder, f"{label.replace(' ', '_')}_{downloaded:04d}.jpg")
                    with open(image_path, "wb") as img_file:
                        img_file.write(img_response.content)
                        downloaded += 1
                        print(f"Downloaded ({downloaded}/{count}): {image_url}")
                except Exception as e:
                    print(f"Failed to process image {idx + offset}: {e}")
                    continue  # Skip to the next image

            # Increment offset to fetch the next set of images
            offset += 50
            attempts += 1
        except Exception as e:
            print(f"Failed to fetch results at offset {offset}: {e}")
            break  # Stop fetching if there's an issue with the request

    print(f"Download completed. {downloaded}/{count} images saved to '{output_folder}'.")

selected_images = []

def review_images_cli(image_folder, output_folder):
    """Review images in the terminal and accept or reject them."""
    # Load images
    images = [os.path.join(image_folder, f) for f in os.listdir(image_folder) if f.endswith('.jpg')]
    os.makedirs(output_folder, exist_ok=True)
    accepted_images = []

    for idx, img_path in enumerate(images):
        try:
            # Load the image
            img = Image.open(img_path)
            
            # Display the image using IPython.display
            clear_output(wait=True)  # Clear the previous image
            print(f"Reviewing image {idx + 1}/{len(images)}: {img_path}")
            display(img)

            # Get user input for the action
            action = input("Enter 'a' to accept, 'r' to reject, 's' to skip, or 'q' to quit: ").strip().lower()

            # Clear and close the plot after the action
            plt.close()

            if action == 'a':
                # Move the image to the output folder
                accepted_images.append(img_path)
                print(f"Accepted: {img_path}")
            elif action == 'r':
                print(f"Rejected: {img_path}")
            elif action == 's':
                print(f"Skipped: {img_path}")
            elif action == 'q':
                print("Exiting review process.")
                break
            else:
                print("Invalid input. Skipping this image.")
        except Exception as e:
            print(f"Failed to show image")
            continue  # Skip to the next image

    # Copy accepted images to the output folder
    for img_path in accepted_images:
        shutil.copy(img_path, os.path.join(output_folder, os.path.basename(img_path)))

    print(f"Review complete. {len(accepted_images)} images accepted and saved to {output_folder}.")

def save_to_kaggle_folder(selected_images, dataset_folder, class_name):
    """Saves selected images to Kaggle dataset structure."""
    class_folder = os.path.join(dataset_folder, class_name)
    os.makedirs(class_folder, exist_ok=True)
    
    for i, img_path in enumerate(selected_images):
        new_name = f"{class_name}_{i+1:04d}.jpg"
        shutil.copy(img_path, os.path.join(class_folder, new_name))

In [9]:
query_tags = [
    ["Eastern Gray Squirrel", "eastern_gray_squirrel"],
    ["Black Eastern Gray Squirrel","melanistic_eastern_gray_squirrel"]
]

for query, label in query_tags:
    scrape_bing_images(query, label, f"../data/sf_bay_backyard_animals/{label}", 250)

Fetching images starting at offset 0...
Downloaded (1/250): https://i.natgeofe.com/k/802ef619-7e16-4796-be4c-c48e2ce5c8c9/eastern-gray-squirrel-standing_3x2.jpg
Failed to process image 1: 403 Client Error: Forbidden. Please comply with the User-Agent policy: https://meta.wikimedia.org/wiki/User-Agent_policy for url: https://upload.wikimedia.org/wikipedia/commons/7/7c/Eastern_Grey_Squirrel_in_St_James's_Park%2C_London_-_Nov_2006_edit.jpg
Downloaded (2/250): https://projectupland.com/wp-content/uploads/2019/06/Eastern-Gray-Squirrel-Sciurus-carolinensis-3.jpg
Failed to process image 3: 403 Client Error: Forbidden. Please comply with the User-Agent policy: https://meta.wikimedia.org/wiki/User-Agent_policy for url: https://upload.wikimedia.org/wikipedia/commons/9/97/Eastern_Grey_Squirrel_Beacon_Hill_Park.jpg
Downloaded (3/250): https://cdn.britannica.com/55/145555-050-2808426A/squirrels-red-North-American-United-Kingdom-most.jpg
Downloaded (4/250): https://i.pinimg.com/originals/4f/2a/1f/4f

In [None]:
review_images_cli("../data/images_to_review", "../data/scraped_images")

In [None]:
save_to_kaggle_folder("../data/scraped_images", "../data/sf_bay_backyard_animals", "dark_eyed_junco")s