# Imports

In [1]:
import os
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET

# Taxon IDs

In [2]:
# iNaturalist taxon IDs to process. For each ID the driver loops in this
# notebook look for a saved page at ./data/websites/{taxon_id}.html.
taxon_ids = [1493141]

In [3]:
# Maximum number of images to handle per taxon. The driver loops pass this
# value as `max_images` to the download helper and as `limit` to the table parser.
num_images = 10

# Functions

In [4]:
def extract_image_info(html_file):
    """Collect (image_url, image_id) pairs from a saved gallery page.

    Args:
        html_file: Path to the saved HTML file (read as UTF-8).

    Returns:
        A list of ``(image_url, image_id)`` tuples, one per matching ``div``
        whose inline ``style`` contains a usable ``url(...)`` value. Divs
        without a ``url(...)`` entry, or whose URL has no path segment that
        can serve as an ID, are skipped instead of yielding garbage entries.
    """
    with open(html_file, "r", encoding="utf-8") as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, "html.parser")
    # A multi-word ``class_`` value is compared against the class attribute as
    # one exact string, so divs with the same classes in a different order or
    # with extra classes are not returned.
    image_divs = soup.find_all("div", class_="CoverImage low undefined loaded")

    image_info = []
    for div in image_divs:
        style_attr = div.get("style")
        if not style_attr:
            continue

        # Locate "url(" explicitly: str.find returns -1 when it is absent, and
        # blindly adding the marker length would slice from a bogus offset.
        marker_index = style_attr.find("url(")
        if marker_index == -1:
            continue
        url_start_index = marker_index + len("url(")

        # Search for the closing parenthesis only after the opening marker so
        # an earlier ")" in the style string cannot be picked up.
        url_end_index = style_attr.find(")", url_start_index)
        if url_end_index == -1:
            continue

        # CSS allows optional whitespace and either quote style inside url().
        image_url = style_attr[url_start_index:url_end_index].strip(" \"'")

        # The ID is the path segment directly before the file name
        # (".../<id>/<file>"); skip URLs that do not have one.
        url_parts = image_url.split("/")
        if len(url_parts) < 2 or not url_parts[-2]:
            continue
        image_id = url_parts[-2]

        image_info.append((image_url, image_id))

    return image_info

# Function to check if all images for a taxon exist in the folder
def check_images_exist(image_info, taxon_id):
    """Return True when every image listed in ``image_info`` is already on disk.

    Args:
        image_info: Sequence of tuples whose second item is the image ID
            (the first item, the URL, is not used here).
        taxon_id: Taxon ID; images are looked up in ``./data/{taxon_id}``.

    Returns:
        False if the taxon folder does not exist or at least one expected
        ``<image_id>.jpg`` file is missing, True otherwise.
    """
    taxon_folder = f"./data/{taxon_id}"
    if not os.path.exists(taxon_folder):
        return False

    # Check each expected file by name instead of comparing the number of
    # *.jpg files with len(image_info): unrelated images in the same folder
    # would otherwise make the counts match (or mismatch) by accident.
    return all(
        os.path.exists(f"{taxon_folder}/{info[1]}.jpg")
        for info in image_info
    )

# Function to download and save images
def download_and_save_images(image_info, taxon_id, max_images=100):
    """Download images into ``./data/{taxon_id}``, skipping files already present.

    Args:
        image_info: Sequence of ``(url, image_id)`` pairs.
        taxon_id: Taxon ID used to build the target folder name.
        max_images: Maximum number of new files to download in this call;
            files that already exist do not count towards the limit.

    A request that fails (network error, timeout, or HTTP error status) is
    reported and skipped, and no file is created for it.
    """
    print(f"Downloading images for Taxon ID {taxon_id}. Total images available: {len(image_info)}")

    taxon_folder = f"./data/{taxon_id}"
    os.makedirs(taxon_folder, exist_ok=True)

    downloaded_count = 0
    for url, image_id in image_info:
        if downloaded_count >= max_images:
            print(f"Reached the limit of {max_images} images for Taxon ID {taxon_id}.")
            break

        # Download and save the image if it doesn't exist already
        image_path = f"{taxon_folder}/{image_id}.jpg"
        if os.path.exists(image_path):
            continue

        # Fetch first and open the file only on success; otherwise a failed
        # request would leave an empty .jpg behind that later runs treat as
        # an already-downloaded image.
        try:
            # requests applies no timeout by default, so set one explicitly
            # to keep a stalled server from blocking the notebook forever.
            response = requests.get(url, timeout=30)
            response.raise_for_status()  # Do not save HTTP error pages as images
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {url}: {e}")
            continue

        with open(image_path, 'wb') as f:
            f.write(response.content)
        downloaded_count += 1

# Download only images loop

1. Go to https://www.inaturalist.org/taxa/{Taxon ID}, for example: "https://www.inaturalist.org/taxa/1493141".
2. Click the 'show more' link next to the images. A page with an image gallery opens. Scroll down and keep scrolling to load more images.
3. When enough images have loaded, press F12 and copy the HTML element.
4. Add the HTML content to a new file under data/websites.
5. Name the HTML file the same as the taxon id.

In [5]:
# Process every configured taxon in turn
for taxon_id in taxon_ids:
    # The saved gallery page is expected to be named after the taxon id
    html_file = f"./data/websites/{taxon_id}.html"

    # Without the saved page there is nothing to parse
    if not os.path.exists(html_file):
        print(f"HTML file for Taxon ID {taxon_id} does not exist.")
        continue

    # Pull the image URLs and IDs out of the saved page
    image_info = extract_image_info(html_file)

    # Nothing to do when every image is already on disk
    if check_images_exist(image_info, taxon_id):
        print(f"All images for Taxon ID {taxon_id} already exist. Skipping...")
        continue

    # Fetch whatever is still missing, capped at num_images
    download_and_save_images(image_info, taxon_id, max_images=num_images)

Downloading images for Taxon ID 1493141. Total images available: 0


# Download images and metadata

1. Go to https://www.inaturalist.org/observations?place_id=any&subview=table&taxon_id={Taxon ID}, for example: "https://www.inaturalist.org/observations?place_id=any&subview=table&taxon_id=1493141".
2. This is a list of observations of this taxon. Scroll down and keep scrolling to load more rows.
3. When enough rows have loaded, press F12. Copy the HTML element. 
4. Add the HTML content to a new file under data/websites.
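5. Name the HTML file the same as the taxon id. Note that this is the same file name the image-only section above uses, so saving this page replaces that one.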

In [8]:
def extract_image_info(html_file, limit=10):
    """Extract image URLs and observation metadata from a saved table page.

    Args:
        html_file: Path to the saved HTML file (read as UTF-8).
        limit: Maximum number of images to return. ``None`` means no limit.

    Returns:
        A list of ``(image_url, image_id, date, time, place)`` tuples with at
        most ``limit`` entries. Date, time and place fall back to
        ``"unknown"`` when the corresponding element is missing from the row.
        Rows without a usable background-image URL are skipped and do not
        count towards ``limit``.
    """
    with open(html_file, "r", encoding="utf-8") as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, "html.parser")
    observations = soup.find_all("tr", class_="ng-scope")

    image_info = []
    # Walk all rows and stop once enough images were collected. Slicing the
    # rows to ``limit`` up front would let rows without an image use up the
    # budget and return fewer images than requested.
    for obs in observations:
        if limit is not None and len(image_info) >= limit:
            break

        image_div = obs.find("a", class_="img")
        if not image_div:
            continue

        style_attr = image_div.get("style")
        if not style_attr or "background-image" not in style_attr:
            continue

        # str.find returns -1 when "url(" is absent; bail out instead of
        # slicing from a bogus offset.
        marker_index = style_attr.find("url(")
        if marker_index == -1:
            continue
        url_start_index = marker_index + len("url(")
        url_end_index = style_attr.find(")", url_start_index)
        if url_end_index == -1:
            continue
        image_url = style_attr[url_start_index:url_end_index].strip(" \"'")

        # The ID is the path segment directly before the file name
        # (".../<id>/<file>"); skip URLs that do not have one.
        url_parts = image_url.split('/')
        if len(url_parts) < 2 or not url_parts[-2]:
            continue
        image_id = url_parts[-2]

        date_span = obs.find("span", class_="date ng-binding")
        time_span = obs.find("span", class_="time ng-binding")
        place_td = obs.find("td", class_="place ng-binding")

        obs_date = date_span.text.strip() if date_span else "unknown"
        obs_time = time_span.text.strip() if time_span else "unknown"
        obs_place = place_td.text.strip() if place_td else "unknown"

        image_info.append((image_url, image_id, obs_date, obs_time, obs_place))

    return image_info

def save_metadata_as_xml(image_info, taxon_id):
    """Write one ``<image_id>.xml`` metadata file per entry into ``./data/{taxon_id}``.

    Each entry of ``image_info`` is a ``(url, image_id, date, time, place)``
    tuple; the values become the text of the ``URL``, ``ImageID``, ``Date``,
    ``Time`` and ``Place`` children of an ``ImageMetadata`` root element.
    """
    target_dir = f"./data/{taxon_id}"

    for url, image_id, obs_date, obs_time, obs_place in image_info:
        # The folder is created lazily, so an empty ``image_info`` creates nothing
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        # Assemble the XML tree; child order matches the tuple below
        root = ET.Element("ImageMetadata")
        fields = (
            ("ImageID", image_id),
            ("URL", url),
            ("Date", obs_date),
            ("Time", obs_time),
            ("Place", obs_place),
        )
        for tag, value in fields:
            child = ET.SubElement(root, tag)
            child.text = value

        # Serialize before opening the file so a serialization error cannot
        # leave an empty file behind
        serialized = ET.tostring(root, encoding="unicode")
        out_path = os.path.join(target_dir, f"{image_id}.xml")
        with open(out_path, "w", encoding="utf-8") as handle:
            handle.write(serialized)

def download_and_save_images(image_info, taxon_id, max_images=100):
    """Download images into ``./data/{taxon_id}``, skipping files already present.

    Args:
        image_info: Sequence of tuples whose first two items are the image
            URL and the image ID; any further items (date, time, place) are
            ignored here.
        taxon_id: Taxon ID used to build the target folder name.
        max_images: Maximum number of new files to download in this call;
            files that already exist do not count towards the limit.

    A request that fails (network error, timeout, or HTTP error status) is
    reported and skipped, and no file is created for it.
    """
    # Nothing has been downloaded at this point, so report what is available.
    print(f"Downloading images for Taxon ID {taxon_id}. Total images available: {len(image_info)}")

    taxon_folder = f"./data/{taxon_id}"
    os.makedirs(taxon_folder, exist_ok=True)

    downloaded_count = 0
    for url, image_id, *_ in image_info:
        if downloaded_count >= max_images:
            print(f"Reached the limit of {max_images} images for Taxon ID {taxon_id}.")
            break

        # Download and save the image if it doesn't exist already
        image_path = f"{taxon_folder}/{image_id}.jpg"
        if os.path.exists(image_path):
            continue

        try:
            # requests applies no timeout by default, so set one explicitly
            # to keep a stalled server from blocking the notebook forever.
            response = requests.get(url, timeout=30)
            response.raise_for_status()  # Raise an error for bad status codes
            with open(image_path, 'wb') as f:
                f.write(response.content)
            downloaded_count += 1
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {url}: {e}")

In [9]:
for taxon_id in taxon_ids:
    # The saved observations page is expected to be named after the taxon id
    html_file = f"./data/websites/{taxon_id}.html"

    # Without the saved page there is nothing to parse
    if not os.path.exists(html_file):
        print(f"HTML file for Taxon ID {taxon_id} does not exist.")
        continue

    # Parse at most num_images rows' worth of image and observation details
    image_info = extract_image_info(html_file, limit=num_images)
    if not image_info:
        print(f"No images found for Taxon ID {taxon_id}.")
        continue

    # Write the per-image XML metadata first, then fetch the image files
    save_metadata_as_xml(image_info, taxon_id)
    download_and_save_images(image_info, taxon_id, max_images=num_images)

Downloading images for Taxon ID 1493141. Total images downloaded: 10
