In [37]:
import os
import requests
import json
import requests
import os
from urllib.parse import urlparse

In [38]:
destination_folder = "./data/"  # Folder the downloaded images are written to; read as a global by the download helpers

# Target number of image files per species; the fetch function compares it with the files already in destination_folder
count_per_species = 10

# Scientific name used as the free-text query (q=...) against the iNaturalist observations API
species_name = "Impatiens glandulifera" # taxon id 47892

In [39]:
def save_image_from_url(url, id):
    """Download one image and store it as ``<id>.jpg`` in ``destination_folder``.

    Args:
        url: Address of the image. A missing or empty value is rejected
            without touching the network.
        id: Identifier used as the file stem. ``None`` is rejected so that no
            ``None.jpg`` file is ever written.

    Returns:
        bool: ``True`` when the file was written; ``False`` on bad input, a
        non-200 response, or any error raised while downloading or writing.
    """
    # Callers read both values with dict.get(), so either one may be None.
    if not url or id is None:
        print("Failed to download image: missing URL or id")
        return False

    try:
        # A timeout keeps a stalled server from hanging the notebook cell.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print("Failed to download image: HTTP status code", response.status_code)
            return False

        # Create the target folder on first use so open() cannot fail on it.
        os.makedirs(destination_folder, exist_ok=True)
        file_path = os.path.join(destination_folder, f"{id}.jpg")
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f"Image saved as: {file_path}")
        return True
    except Exception as e:
        # Best-effort download: report the problem and let the caller carry on.
        print("An error occurred:", str(e))
        return False

In [40]:
# Where medium-sized image URLs can be found in an observation record:
# 1: taxon.default_photo.medium_url
# 2: previous_observation_taxon.default_photo.medium_url
# 3: taxon.ancestors[].default_photo.medium_url (a list, sometimes with several ancestors: [{}, {}])

In [1]:
def fetch_photos_for_species(species_name, destination_folder, count_per_species):
    """Download default photos of the ancestor taxa found in a species' observations.

    Observations matching ``species_name`` are read page by page from the
    iNaturalist API. For every ancestor listed under an observation's taxon,
    the ancestor's default photo is saved as ``<name>_<id>.jpg``.

    NOTE: a later cell defines another ``fetch_photos_for_species`` that takes
    only ``species_name``; whichever cell ran last is the one that gets called.

    Args:
        species_name: Free-text query sent as ``q`` to the observations endpoint.
        destination_folder: Folder the images are written to (created if missing).
        count_per_species: Desired total number of files in ``destination_folder``.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    # Create the folder up front so that listing it and writing to it cannot fail.
    os.makedirs(destination_folder, exist_ok=True)

    # Check the destination folder for existing files
    existing_files = [f for f in os.listdir(destination_folder) if os.path.isfile(os.path.join(destination_folder, f))]
    existing_files_count = len(existing_files)
    print(f"Existing files in the folder: {existing_files_count}")

    # Proceed with fetching photos only if the existing files count is less than the desired count_per_species
    remaining_count = count_per_species - existing_files_count
    if remaining_count <= 0:
        print("Desired image count already reached. No need to fetch more.")
        return
    elif remaining_count < count_per_species:
        print(f"Need to fetch {remaining_count} more images.")

    api_url = "https://api.inaturalist.org/v1/observations"

    total_images_saved = 0
    # Observations of one species share their ancestors; remember the ones
    # already handled so each taxon is looked up and counted only once.
    seen_ancestor_ids = set()

    try:
        page = 1

        # Only the images still missing are fetched, not count_per_species again.
        while total_images_saved < remaining_count:
            print(f"Fetching page {page}")
            # `params` lets requests URL-encode the species name (it contains spaces).
            response = requests.get(api_url, params={"q": species_name, "page": page}, timeout=30)

            if response.status_code != 200:
                print(f"Error fetching data from iNaturalist API. Status code: {response.status_code}")
                break

            results = response.json().get('results', [])
            if not results:
                # Past the last page: stop instead of requesting pages forever.
                print("No more results.")
                break

            # Iterate over each observation result
            for result in results:
                # 'taxon' may be absent or null for unidentified observations.
                taxon = result.get('taxon') or {}

                for ancestor in taxon.get('ancestors') or []:
                    if total_images_saved >= remaining_count:
                        break

                    # An entry is either a bare taxon id or an already expanded taxon record.
                    ancestor_data = ancestor if isinstance(ancestor, dict) else None
                    ancestor_id = ancestor_data.get('id') if ancestor_data is not None else ancestor
                    if ancestor_id is None or ancestor_id in seen_ancestor_ids:
                        continue
                    seen_ancestor_ids.add(ancestor_id)

                    if ancestor_data is None:
                        ancestor_data = fetch_taxon_info(ancestor_id)
                        # The lookup may hand back the API's `results` list; unwrap it.
                        if isinstance(ancestor_data, list):
                            ancestor_data = ancestor_data[0] if ancestor_data else None
                    if not ancestor_data:
                        continue

                    image_url, _image_id = get_image_info(ancestor_data)
                    if not image_url:
                        continue

                    image_response = requests.get(image_url, timeout=30)
                    if image_response.status_code != 200:
                        # Do not store an error page under a .jpg name.
                        print("Failed to download image: HTTP status code", image_response.status_code)
                        continue

                    ancestor_name = ancestor_data.get('name', 'taxon')
                    file_id = ancestor_data.get('id', ancestor_id)
                    file_path = os.path.join(destination_folder, f"{ancestor_name}_{file_id}.jpg")
                    with open(file_path, 'wb') as f:
                        f.write(image_response.content)
                    total_images_saved += 1
                    print(f"Saved image for {ancestor_name} with ID {file_id}")

                if total_images_saved >= remaining_count:
                    break

            page += 1

        print(f"Total images saved: {total_images_saved}")
    except Exception as e:
        print("Error occurred:", e)

# Function to fetch taxon info by ID
def fetch_taxon_info(taxon_id):
    """Look up a single taxon record by its iNaturalist ID.

    Args:
        taxon_id: ID placed in the ``/v1/taxa/{taxon_id}`` URL.

    Returns:
        dict | None: The taxon record, or ``None`` when the request does not
        return HTTP 200 or the response carries no record.

    Raises:
        Whatever ``requests.get`` raises on connection problems or timeouts.
    """
    url = f"https://api.inaturalist.org/v1/taxa/{taxon_id}"
    # A timeout keeps a stalled server from hanging the notebook cell.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        return None

    payload = response.json()
    results = payload.get('results') if isinstance(payload, dict) else None
    # The caller reads the return value as one record (['id'], ['name']),
    # so a `results` list is unwrapped to its first entry.
    if isinstance(results, list):
        return results[0] if results else None
    return results or None

# Function to fetch image URL and ID for a taxon
def get_image_info(taxon):
    """Return ``(image_url, photo_id)`` for a taxon's default photo.

    The medium-sized image (``medium_url``) is preferred, matching the rest of
    this notebook; ``url`` is used when no medium URL is present.

    Args:
        taxon: Taxon record (dict) that may carry a ``default_photo`` entry.

    Returns:
        tuple: ``(url, id)``; ``(None, None)`` when ``taxon`` is not a dict,
        has no default photo, or the photo carries no usable URL. ``id`` is
        ``None`` when the photo has no ``id`` key.
    """
    photo = taxon.get('default_photo') if isinstance(taxon, dict) else None
    if not isinstance(photo, dict):
        return None, None

    image_url = photo.get('medium_url') or photo.get('url')
    if not image_url:
        return None, None
    return image_url, photo.get('id')

# Example usage
# fetch_photos_for_species("Impatiens glandulifera", "./data", 5)


In [82]:
def fetch_photos_for_species(species_name):
    """Download medium-sized photos for a species until the folder is full.

    Observations matching ``species_name`` are read page by page from the
    iNaturalist API. For each observation, the default photos of the taxon's
    ancestors are saved, and the taxon's own default photo is saved once per
    call. Files are written by ``save_image_from_url`` as ``<photo id>.jpg``.

    Reads the notebook-level ``destination_folder`` and ``count_per_species``.

    Args:
        species_name: Free-text query sent as ``q`` to the observations endpoint.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    # Create the folder up front so that listing it cannot fail on a fresh checkout.
    os.makedirs(destination_folder, exist_ok=True)

    # Check the destination folder for existing files
    existing_files = [f for f in os.listdir(destination_folder) if os.path.isfile(os.path.join(destination_folder, f))]
    existing_files_count = len(existing_files)
    print(f"Existing files in the folder: {existing_files_count}")

    # Proceed with fetching photos only if the existing files count is less than the desired count_per_species
    remaining_count = count_per_species - existing_files_count
    if remaining_count <= 0:
        print("Desired image count already reached. No need to fetch more.")
        return
    elif remaining_count < count_per_species:
        print(f"Need to fetch {remaining_count} more images.")

    api_url = "https://api.inaturalist.org/v1/observations"

    total_images_saved = 0
    # Files are named "<photo id>.jpg", so the stems of the existing files are
    # the photo ids already on disk. Tracking them avoids downloading, or
    # counting, the same photo twice.
    known_photo_ids = {os.path.splitext(f)[0] for f in existing_files}

    try:
        page = 1
        default_photo_saved = False  # Flag to track whether default photo has been saved

        # Only the images still missing are fetched, not count_per_species again.
        while total_images_saved < remaining_count:
            print(f"Fetching page {page}")
            # `params` lets requests URL-encode the species name (it contains spaces).
            response = requests.get(api_url, params={"q": species_name, "page": page}, timeout=30)

            if response.status_code != 200:
                print(f"Error fetching data from iNaturalist API. Status code: {response.status_code}")
                break

            results = response.json().get('results', [])
            if not results:
                # Past the last page: stop instead of requesting the same page forever.
                print('No results found')
                break

            for observation in results:
                # `or {}` also covers keys that are present but null in the JSON.
                observation_taxon = observation.get('taxon') or {}
                observation_taxon_default_photo = observation_taxon.get('default_photo') or {}

                # Candidate photos as (photo, is_taxon_default) pairs: ancestors first.
                candidates = []
                for ancestor in observation_taxon.get('ancestors') or []:
                    if isinstance(ancestor, dict):
                        candidates.append((ancestor.get('default_photo') or {}, False))

                if not observation_taxon_default_photo:
                    print("No default photo found for this observation")
                elif not default_photo_saved:
                    # Default photo (run only once)
                    candidates.append((observation_taxon_default_photo, True))

                for photo, is_taxon_default in candidates:
                    if total_images_saved >= remaining_count:
                        break

                    photo_id = photo.get('id')
                    photo_url = photo.get('medium_url')
                    if photo_id is None or not photo_url:
                        continue
                    if is_taxon_default:
                        default_photo_saved = True

                    photo_key = str(photo_id)
                    if photo_key in known_photo_ids:
                        continue
                    # Recorded before the attempt so a failing URL is not retried.
                    known_photo_ids.add(photo_key)

                    if save_image_from_url(photo_url, photo_id):
                        total_images_saved += 1

                if total_images_saved >= remaining_count:
                    break

            page += 1

        print(f"Total unique images saved: {total_images_saved}")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

In [2]:
# Fetch and download photos for the specified species
# fetch_photos_for_species(species_name)

In [None]:
def extract_medium_image_urls(observation):
    """Collect the medium-sized photo URLs referenced by one record.

    Two places are read: ``observation_photos[].photo.medium_url`` and
    ``taxon.ancestors[].default_photo.medium_url``. Missing or null entries
    are skipped instead of raising.

    Args:
        observation: dict to read the photo URLs from.

    Returns:
        list: URLs from ``observation_photos`` first, then from the ancestors.
    """
    urls = []

    # Extract medium image URLs from observation_photos
    for photo_info in observation.get('observation_photos') or []:
        photo = photo_info.get('photo') if isinstance(photo_info, dict) else None
        if isinstance(photo, dict) and photo.get('medium_url'):
            urls.append(photo['medium_url'])

    # Extract medium image URLs from ancestors
    taxon = observation.get('taxon') or {}
    for ancestor in taxon.get('ancestors') or []:
        photo = ancestor.get('default_photo') if isinstance(ancestor, dict) else None
        if isinstance(photo, dict) and photo.get('medium_url'):
            urls.append(photo['medium_url'])

    return urls


# NOTE(review): `data` is not assigned at notebook level in any visible cell,
# so this line raises NameError on a fresh kernel. Presumably it should be a
# single observation record -- confirm and define it before running this cell.
medium_image_urls = extract_medium_image_urls(data)
