https://help.inaturalist.org/en/support/solutions/articles/151000170344

This file contains the code used to:

- Retrieve the iNaturalist taxon id of each species
- Download the images present in iNaturalist for each species

through iNaturalist API calls.


Note that some species names returned no results from the iNaturalist taxa search and were therefore dropped:

No results for lythrum americanum

No results for lythrum hyrcanicum

No results for lythrum lydiae

No results for lythrum nieuwlandii

No results for lythrum schelkovnikovii

No results for lythrum theodori

### Retrieving iNaturalist ids

In [1]:
import csv
import json
import os
import time
from datetime import datetime, timedelta, timezone
from typing import Dict, List, Optional

import pandas as pd
import requests
from tqdm import tqdm

In [2]:
# Constants for session tracking
# JSON file in which the hourly/daily byte counters are persisted between runs.
SESSION_LOG_PATH = './support_files/download_session.json'
# Byte budgets compared against the persisted counters before an image is stored.
# NOTE(review): presumably these mirror the media download limits recommended by iNaturalist
# (see the help article linked at the top of the notebook) - confirm the exact figures.
HOURLY_LIMIT = 5 * 1024**3  # 5 GiB, i.e. 5 * 1024**3 bytes
DAILY_LIMIT = 24 * 1024**3  # 24 GiB, i.e. 24 * 1024**3 bytes

def load_download_session():
    """Return the download-tracking session, read from disk when a log already exists.

    When SESSION_LOG_PATH is present its JSON content is returned as-is. Otherwise a fresh
    session is built in memory (nothing is written): both windows start now (UTC, ISO 8601)
    and both byte counters are zero.
    """
    if os.path.exists(SESSION_LOG_PATH):
        with open(SESSION_LOG_PATH, 'r') as session_file:
            return json.load(session_file)

    # No log on disk yet: start a brand-new hourly and daily window.
    fresh_session = {
        "hour_start": datetime.now(timezone.utc).isoformat(),
        "day_start": datetime.now(timezone.utc).isoformat(),
        "bytes_downloaded_this_hour": 0,
        "bytes_downloaded_today": 0
    }
    return fresh_session

def save_download_session(session_data):
    """Persist the session dictionary as JSON at SESSION_LOG_PATH.

    The parent folder of the log is created first when it does not exist yet; the log file
    itself is overwritten on every call.
    """
    log_dir = os.path.dirname(SESSION_LOG_PATH)
    log_dir_missing = not os.path.exists(log_dir)
    if log_dir_missing:
        os.makedirs(log_dir)

    with open(SESSION_LOG_PATH, 'w') as session_file:
        json.dump(session_data, session_file)

def check_and_reset_limits(session):
    """Start a new hourly and/or daily window when the current one has expired.

    For each window whose start timestamp is at least one hour (resp. one day) older than the
    current UTC time, the start is moved to now and the matching byte counter is zeroed.
    The dictionary is updated in place and also returned.
    """
    current_time = datetime.now(timezone.utc)

    # (key holding the window start, key holding its byte counter, window length)
    windows = (
        ("hour_start", "bytes_downloaded_this_hour", timedelta(hours=1)),
        ("day_start", "bytes_downloaded_today", timedelta(days=1)),
    )

    for start_key, counter_key, window_length in windows:
        window_opened = datetime.fromisoformat(session[start_key])
        if current_time - window_opened < window_length:
            continue  # this window is still running
        session[start_key] = current_time.isoformat()
        session[counter_key] = 0

    return session

In [3]:
# Creative Commons license codes whose photos are kept; photos with any other license code
# (or with none at all) are skipped when collecting metadata.
ACCEPTED_LICENSES = {'cc0', 'cc-by', 'cc-by-nc', 'cc-by-sa', 'cc-by-nc-sa', 'cc-by-nd', 'cc-by-nc-nd'}

# Global counter of API calls, used to stay under the daily limit (no more than 10000).
# It lives in memory only: it starts again from 0 every time this cell is executed.
api_call_count = 0

In [4]:
# Input CSV with the species list in the correct name format (read through its 'Species' column).
# NOTE(review): the original comment says it is the same file used by 'plant_zones_extraction' - confirm.
species_list_path = 'support_files/invasive_plants_name_list.csv'

# Output CSV: the species list extended with an 'inat_id' column (used as a cache by the main cell).
species_id_path = 'support_files/species_inaturalist_ids.csv'

# Text file with one already-downloaded photo URL per line, used to resume interrupted downloads.
DOWNLOAD_LOG_PATH = './support_files/downloaded_photos.txt'

### Retrieving the IDs from the taxa

In [5]:
def get_taxon_id(scientific_name: str) -> Optional[int]:
    """
    Given a scientific name (e.g. 'lythrum salicaria') return the corresponding iNaturalist id.

    The name is sent as the `q` search parameter of the `/v1/taxa` endpoint and the id of the
    first result is returned.

    Parameters:
        - scientific_name: name of the taxon to look up

    Returns:
        The taxon id as an int, or None when the request does not return HTTP 200 or when the
        search yields no results (a message is printed in both cases).
    """

    global api_call_count

    url = "https://api.inaturalist.org/v1/taxa"
    params = {'q': scientific_name}
    #user agent is not necessary but recommended, to be able to be contacted by iNaturalist if needed
    headers = {'User-Agent': 'InatImageDownloader/1.0 (s319848@studenti.polito.it)'}

    # A timeout prevents one stalled connection from blocking the whole species list forever.
    response = requests.get(url, params=params, headers=headers, timeout=30)

    # Count every request that was actually sent, successful or not, the same way
    # get_all_photo_metadata does.
    api_call_count += 1

    time.sleep(0.5)

    if response.status_code != 200:
        print(f"Error fetching taxon for {scientific_name}: {response.status_code}")
        return None

    data = response.json()

    # Abridged example of a response body (only the fields relevant here):
    #
    # {
    #   "total_results": 1,
    #   "page": 1,
    #   "per_page": 30,
    #   "results": [
    #     {
    #       "id": 61321,
    #       "rank": "species",
    #       "is_active": true,
    #       "name": "Lythrum salicaria",
    #       "observations_count": 99403,
    #       "matched_term": "Lythrum salicaria",
    #       "iconic_taxon_name": "Plantae",
    #       "preferred_common_name": "purple loosestrife"
    #     }
    #   ]
    # }

    results = data.get('results', []) #if results key is absent, use an empty list

    if not results:
        print(f"No results for {scientific_name}")
        return None

    # NOTE(review): the first search hit is trusted without comparing its "name"/"rank" with the
    # requested species - confirm that `q` cannot return a different taxon as first result.
    return int(results[0]['id'])

In [6]:
def retrieve_species_ids(species_list_path, species_id_path):

    """
    Given a file containing the list of species, retrieve the iNaturalist id for each taxon.
    Return a dataframe containing the species taxon and the corresponding iNaturalist id.

    Parameters:
        - species_list_path: CSV file with (at least) a 'Species' column
        - species_id_path: CSV file where the resulting dataframe is saved

    Species for which no id could be retrieved are dropped from the result.
    """

    #dataframe containing the list of species. It may also contain other columns, which are kept untouched
    species_list = pd.read_csv(species_list_path)

    #one API lookup per species; None marks the species that could not be resolved
    species_list['inat_id'] = [get_taxon_id(taxon) for taxon in species_list['Species'].to_list()]

    #work on an explicit copy of the filtered rows, so that the column assignment below
    #does not trigger pandas' SettingWithCopyWarning
    species_list = species_list.dropna(subset=['inat_id']).copy()
    #missing ids force the column to a float/object dtype: go back to plain integers
    species_list['inat_id'] = species_list['inat_id'].astype(int)

    #save the new dataframe in a csv
    species_list.to_csv(species_id_path, index=False)

    return species_list



In [7]:
# species_id = retrieve_species_ids(species_list_path, species_id_path)

# species_id

### Downloading pictures and metadata for each species

In [8]:
def get_all_photo_metadata(taxon_id, max_api_calls=10000) -> List[Dict]:
    """Retrieve photo metadata for all observations of a taxon.

    The `/v1/observations` endpoint is read page by page (200 observations per page, one
    second apart) until an empty page, a non-200 answer or the API budget stops the loop.
    Only photos whose license code is in ACCEPTED_LICENSES are kept.

    Parameters:
        - taxon_id: iNaturalist id of the taxon
        - max_api_calls: value of the global `api_call_count` at which paging stops

    Returns:
        A list with one dictionary per accepted photo, containing
            - url (the photo URL with 'square' replaced by 'medium')
            - license_code (lower-cased)
            - observation_id
            - quality_grade
            - latitude, longitude (None when the observation has no coordinates)
    """

    global api_call_count
    photo_metadata = []
    page = 1
    per_page = 200
    url = 'https://api.inaturalist.org/v1/observations'
    headers = {'User-Agent': 'InatImageDownloader/1.0 (s319848@studenti.polito.it)'}

    # NOTE(review): the iNaturalist API documentation describes a cap on how deep `page` can go
    # (10,000 results) and suggests `id_above`/`id_below` beyond it - confirm; if so, very common
    # taxa end this loop through the "Error at page" branch with a partial list.
    while True:
        if api_call_count >= max_api_calls:
            print("Reached daily API limits")
            break

        params = {
            'taxon_id': taxon_id,
            'photos': 'true',
            'page': page,
            'per_page': per_page,
            'order_by': 'created_at'
        }

        # A timeout prevents one stalled connection from blocking the run forever.
        response = requests.get(url, params=params, headers=headers, timeout=30)

        time.sleep(1)

        api_call_count += 1

        if response.status_code != 200:
            print(f"Error at page {page}, (HTTP {response.status_code})")
            break

        results = response.json().get('results', [])

        if not results:
            break

        for obs in tqdm(results, desc=f'Page {page}', leave=False):

            obs_id = obs['id']
            quality = obs.get('quality_grade')

            # 'geojson' and its 'coordinates' may be absent or null: fall back to (None, None)
            # instead of failing while unpacking.
            geo = obs.get('geojson') or {}
            coords = list(geo.get('coordinates') or [])
            lon, lat = (coords + [None, None])[:2]

            for photo in obs.get('photos', []):

                if not photo:
                    continue

                # Normalise the case before filtering. The original code called .lower() without
                # keeping the result, so the comparison was case-sensitive.
                license_code = (photo.get('license_code') or '').lower()

                #if the photo has no license ('all rights reserved') or a non accepted one, skip it
                if license_code not in ACCEPTED_LICENSES:
                    continue

                square_url = photo.get('url')
                if not square_url:
                    continue  # nothing to download for this photo

                #You can change "medium" to "original" to increase quality, but medium reduces the stress on the servers
                full_url = square_url.replace('square', 'medium')

                photo_metadata.append(
                    {
                        'url': full_url,
                        'license_code': license_code,
                        'observation_id': obs_id,
                        'quality_grade': quality,
                        'latitude': lat,
                        'longitude': lon
                    }
                )

        page += 1

    # api_call_count is the global counter: it also includes the calls made for previous taxa.
    print(f'Retrieved {len(photo_metadata)} photo entries using {api_call_count} API calls')

    return photo_metadata

In [9]:
def save_downloaded_url(url, file_path=DOWNLOAD_LOG_PATH):
    """Append one photo URL, followed by a newline, to the download log.

    The folder that contains the log is created first when it does not exist yet.
    """
    log_dir = os.path.dirname(file_path)
    log_dir_missing = not os.path.exists(log_dir)
    if log_dir_missing:
        os.makedirs(log_dir)

    entry = url + '\n'
    with open(file_path, 'a') as log_file:
        log_file.write(entry)

In [10]:
def load_downloaded_urls(file_path=DOWNLOAD_LOG_PATH) -> set:
    """Read the download log and return its URLs as a set (duplicates collapse).

    Surrounding whitespace is stripped from every line and blank lines are ignored.
    A missing log file yields an empty set.
    """
    seen_urls = set()

    if os.path.exists(file_path):
        with open(file_path, 'r') as log_file:
            for raw_line in log_file:
                cleaned = raw_line.strip()
                if cleaned:
                    seen_urls.add(cleaned)

    return seen_urls

In [11]:
def download_photos(photo_metadata: List[Dict], species_name: str, downloaded_urls: set, download_log=DOWNLOAD_LOG_PATH):
    """This function downloads photos related to a given species, keeping track of the downloaded
    URLs to avoid downloading them twice and being able to resume the downloading if interrupted.

    Images are written to support_files/photos/<species_name with underscores>/ and every stored
    image adds a 'filename' entry to its metadata dictionary (the dictionaries are modified in
    place). Downloading stops as soon as storing one more image would exceed HOURLY_LIMIT or
    DAILY_LIMIT. The byte counters are written back to the session file even when the loop is
    interrupted (e.g. by KeyboardInterrupt).

    Parameters:
        - photo_metadata: list of image metadata from iNaturalist, including URL
        - species_name: scientific name of the species (a single string)
        - downloaded_urls: set of already downloaded urls to avoid downloading them twice
        - download_log: the file to save the newly downloaded photo URLs
    """

    species_slug = species_name.replace(' ', '_')
    species_dir = os.path.join('support_files', 'photos', species_slug)
    os.makedirs(species_dir, exist_ok=True) #create folder if it doesn't exist yet

    downloaded_count = 0 #keep track of how many images we have downloaded in this session

    #load a json file that remembers when the last download session started and how many bytes
    #have been downloaded so far this hour and this day
    session = load_download_session()

    try:
        for i, photo in tqdm(enumerate(photo_metadata), total=len(photo_metadata), desc=f'{species_name}'):

            url = photo['url'] #retrieve the URL from the metadata

            if url in downloaded_urls:
                #NOTE(review): photos skipped here get no 'filename' entry, so a metadata CSV
                #written after a resumed run has an empty filename for them - confirm this is intended
                continue #skip this image if it has already been downloaded

            #a species can take longer than one hour: check the hourly/daily windows for every
            #image rather than once per call, so that expired counters are reset while running
            session = check_and_reset_limits(session)

            try:
                #send a GET request to the image URL (not an API call); the timeout prevents
                #one stalled connection from blocking the whole run
                response = requests.get(url, timeout=30)

                #check if the server returned a successful response
                if response.status_code == 200:

                    image_data = response.content
                    image_size = len(image_data)

                    # Check byte limits (the image is discarded if it does not fit in the budget)
                    if session["bytes_downloaded_this_hour"] + image_size > HOURLY_LIMIT:
                        print("Hourly limit reached (5GB). Halting downloads.")
                        break
                    if session["bytes_downloaded_today"] + image_size > DAILY_LIMIT:
                        print("Daily limit reached (24GB). Halting downloads.")
                        break

                    #get the file extension (jpg, png) by splitting the URL at the dots
                    ext = url.split('.')[-1].split('?')[0] #handles URLs with ?query=string

                    #create a file name like "lythrum_salicaria_001.jpg"; the number is the
                    #1-based position of the photo in photo_metadata
                    filename = f"{species_slug}_{i+1:03}.{ext}"
                    filepath = os.path.join(species_dir, filename)

                    with open(filepath, 'wb') as f:
                        f.write(image_data)

                    # Add the filename to the metadata dictionary: the caller's list holds the
                    # same dictionary objects, so the change is visible there too
                    photo['filename'] = filename

                    #log this URL to the file so we don't re-download it later
                    save_downloaded_url(url, download_log)

                    # Update counters
                    session["bytes_downloaded_this_hour"] += image_size
                    session["bytes_downloaded_today"] += image_size

                    downloaded_count += 1

                else:
                    print(f'Failed to download {url} (HTTP {response.status_code})')

            except Exception as e:
                #best effort: one failing image must not stop the remaining downloads
                print(f'Error downloading {url}: {e}')

            time.sleep(0.1)

    finally:
        #always persist the counters, also when the loop is interrupted, otherwise the bytes
        #downloaded so far would not count against the limits of the next run
        save_download_session(session)

    print(f'{downloaded_count} new images downloaded for {species_name}')

In [12]:
def save_metadata_csv(metadata_list: List[Dict], filename: str):
    """Save the picture metadata in a csv file

    Parameters:
        - metadata_list: one dictionary per photo; keys missing from a dictionary (e.g.
          'filename' for photos that were not downloaded) are written as empty cells
        - filename: path of the CSV to write; missing parent folders are created and an
          existing file is overwritten

    Raises:
        ValueError (from csv.DictWriter) if a dictionary contains a key that is not one of
        the columns listed below.
    """

    folder = os.path.dirname(filename)
    # dirname is '' for a bare file name: nothing to create in that case
    # (os.makedirs('') would raise FileNotFoundError)
    if folder:
        os.makedirs(folder, exist_ok=True)

    keys = ['filename', 'url', 'license_code', 'observation_id', 'quality_grade', 'latitude', 'longitude']
    # explicit encoding, so that the output does not depend on the platform default
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        writer.writerows(metadata_list)


### Usage

In [14]:
### MAIN FILE ###

# Reuse the cached species/id table when it exists; otherwise build it through the API
# (retrieve_species_ids also writes it to species_id_path).
if os.path.isfile(species_id_path):
    species_id_df = pd.read_csv(species_id_path)
else:
    species_id_df = retrieve_species_ids(species_list_path, species_id_path)

downloaded_urls = load_downloaded_urls()


# For a quick test, uncomment the line below and loop over `test_df` instead of `species_id_df`:
# it only keeps the first species, lythrum acutangulum, which has 24 pictures.
# test_df = species_id_df.head(1)

# for index, row in test_df.iterrows():
for index, row in species_id_df.iterrows():
    species = row['Species']
    taxon_id = row['inat_id']

    print(f'Processing species {species} (ID: {taxon_id})')

    photo_metadata = get_all_photo_metadata(taxon_id)

    # Re-read the log for every species: it is the source of truth for what has really been
    # downloaded. (The set is deliberately NOT extended with every URL of photo_metadata any
    # more: that also marked failed or limit-skipped downloads as done, and the result was
    # discarded by this reload anyway.)
    downloaded_urls = load_downloaded_urls()

    download_photos(photo_metadata, species, downloaded_urls)

    save_metadata_csv(photo_metadata, f"./support_files/metadata/{species.replace(' ', '_')}_metadata.csv")

Processing species lythrum acutangulum (ID: 1562194)


                                             

Retrieved 24 photo entries using 4 API calls


lythrum acutangulum: 100%|██████████| 24/24 [00:00<00:00, 24076.37it/s]

0 new images downloaded for lythrum acutangulum
Processing species lythrum alatum (ID: 128998)



                                               

KeyboardInterrupt: 