# 1 Import

In [1]:
import pandas as pd
import os
import csv
import requests
import gzip
import os
import requests
import boto3
from botocore.exceptions import NoCredentialsError

# 2 Species Names selected

In [2]:
species_list = [
    "Cortaderia jubata",
    "Cardiospermum grandiflorum",
    "Heracleum sosnowskyi",
    "Cenchrus setaceus",
    "Ailanthus altissima",
    "Lysichiton americanus",
    "Hakea sericea",
    "Lygodium japonicum",
    "Microstegium vimineum",
    "Heracleum mantegazzianum",
    "Lespedeza cuneata",
    "Triadica sebifera",
    "Pueraria montana var. Lobata",
    "Prosopis juliflora",
    "Gunnera tinctoria",
    "Baccharis halimifolia",
    "Asclepias syriaca",
    "Heracleum persicum",
    "Ehrharta calycina",
    "Andropogon virginicus"
]

# 3 Download CSV with photo ids in the terminal

(instructions are from https://github.com/inaturalist/inaturalist-open-data/tree/main/Metadata/Download) <br><br>

- Step 2: Download the photos.csv.gz file: <br>
aws s3 --no-sign-request --region us-east-1 cp s3://inaturalist-open-data/photos.csv.gz photos.csv.gz

- Step 3: Download the taxa.csv.gz file: <br>
aws s3 --no-sign-request --region us-east-1 cp s3://inaturalist-open-data/taxa.csv.gz taxa.csv.gz 

- Step 4: Download the observations.csv.gz file: <br>
aws s3 --no-sign-request --region us-east-1 cp s3://inaturalist-open-data/observations.csv.gz observations.csv.gz

- Step 5: Unzip both files manually!

# 4 Get the taxa.csv file loaded and find the photo ids we want

In [3]:
# Function to filter photos for given taxon IDs
def filter_photos_for_taxon_ids(photos_file_path, taxon_ids):
    filtered_photos = []
    with open(photos_file_path, 'r') as f:
        header = f.readline().strip().split(',')
        for line in f:
            row = line.strip().split(',')
            if len(row) >= 6 and int(row[5]) in taxon_ids:
                filtered_photos.append(row)
    return pd.DataFrame(filtered_photos, columns=header)

In [4]:
# Check if the taxons_data file exists
if os.path.exists('taxons_data.parquet'):
    # Load DataFrame from Parquet
    taxons_data = pd.read_parquet('taxons_data.parquet')
    print("DataFrame loaded from existing taxons_data file.")
else:
    # Load the taxonomic data from the CSV file with tab delimiter
    tax = pd.read_csv('./taxa.csv/taxa.csv', delimiter='\t')
    # Filter rows with species names from the list
    filtered_taxa = tax[tax['name'].isin(species_list)]
    # Extract a list of all taxon IDs and names from the filtered DataFrame
    taxons_data = filtered_taxa[['taxon_id', 'name']]
    # Combine taxon_ids and names into a DataFrame
    combined_data = pd.DataFrame({'Taxon IDs': taxons_data['taxon_id'], 'Names': taxons_data['name']})
    # Convert the DataFrame to a tab-separated string
    taxons_data = combined_data.to_csv(sep='\t', index=False)
    # Save DataFrame to Parquet
    taxons_data.to_parquet('taxons_data.parquet')
    # Print the tab-separated string
    print("DataFrame created and saved from taxons_data file.")

# Display the loaded DataFrame
print(taxons_data.head(1))

DataFrame loaded from existing taxons_data file.
       taxon_id                name
14447     48059  Prosopis juliflora


# 5 Get photos from the IDs

### Need to know which columns we got.

In [5]:
with open('./photos.csv/photos.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    header = next(reader)
    print(header)

['photo_uuid\tphoto_id\tobservation_uuid\tobserver_id\textension\tlicense\twidth\theight\tposition']


### Make a smaller subset of the photos file

In [7]:
from dask import dataframe as dd
import time

start = time.time()
dask_df = dd.read_csv('./photos.csv/photos.csv')
end = time.time()
print("Read csv with dask: ",(end-start),"sec")

Read csv with dask:  0.05032920837402344 sec


In [8]:
dask_df.head()

Unnamed: 0,photo_uuid\tphoto_id\tobservation_uuid\tobserver_id\textension\tlicense\twidth\theight\tposition
0,8d6b2534-d30a-47a8-bc1c-986a21817997\t21213\t7...
1,6e8112fd-f703-4052-94da-b7cfc03ff3d4\t21216\t7...
2,49141c2f-48b0-4671-9cee-fce6efd11822\t21215\t0...
3,71090faa-9110-4df7-bb8f-af2415fe2e72\t21214\t5...
4,92c703d0-20f1-4da9-af9c-2a6bdb6db53b\t21217\te...


In [6]:
# Check if the photos_data file exists
if os.path.exists('photos_data.parquet'):
    print("DataFrame loaded from existing photos_data file.")
    # Load DataFrame from Parquet
    photos_data = pd.read_parquet('photos_data.parquet')
    print("DataFrame loaded from existing photos_data file.")
else:
    print("DataFrame created and saved from taxons_data file.")
    # Load specific columns from the CSV file
    columns_to_load = ['photo_id', 'observation_uuid']
    chunk_size = 1000 
    # Initialize an empty list to store chunks of data
    photo_chunks = []
    # Load the CSV file in chunks
    for chunk in pd.read_csv('./photos.csv/photos.csv', sep='\t', usecols=columns_to_load, chunksize=chunk_size):
        photo_chunks.append(chunk)
    photos_data = pd.concat(photo_chunks)
    photos_data.head(2)
    # Save DataFrame to Parquet
    taxons_data.to_parquet('photos_data.parquet')
    # Print the tab-separated string


# Display the loaded DataFrame
print(photos_data.head(1))

# Observations

In [None]:
# Check if the observations_data file exists
if os.path.exists('observations_data.parquet'):
    # Load DataFrame from Parquet
    observations_data = pd.read_parquet('observations_data.parquet')
    print("DataFrame loaded from existing observations_data file.")
else:
    # the observations zipped file is too large to unpack completely, we only take a small subset

    with gzip.open('./observations.csv/observations.csv.gz', 'rt') as f_in:
        with open('./observations.csv/observations_sample.csv', 'w') as f_out:
            for i, line in enumerate(f_in):
                if i >= 10000:  
                    break
                f_out.write(line)
    observations = pd.read_csv('./observations.csv/observations_sample.csv', delimiter='\t', quoting=csv.QUOTE_NONE)
    # Extract a list of all taxon IDs and names from the filtered DataFrame
    observations_data = observations[['taxon_id', 'observation_uuid']]
    # Save DataFrame to Parquet
    observations_data.to_parquet('observations_data.parquet')
    # Print the tab-separated string
    print("DataFrame created and saved from observations_data file.")

# Display the loaded DataFrame
print(observations_data.head(1))

# Merge and result

### The list of observers and photos

In [None]:
# Merge photos_data and observations_data on observer_id
# merged_data = pd.merge(photos_data, observations_data, on='observation_uuid')

In [None]:
# merged_data.head(1)

### The list of the photos with the specific taxons i want.

In [None]:
# Extract unique taxon_ids from taxon_df
# valid_taxon_ids = taxons_data['taxon_id']

# Filter merged_data to contain only rows with taxon_ids present in valid_taxon_ids
# filtered_data = merged_data[merged_data['taxon_id'].isin(valid_taxon_ids)]

In [None]:
# len(filtered_data)

In [None]:
# filtered_data.head(5)

In [None]:
# Grouping the filtered_data DataFrame by 'taxon_id' and counting the number of 'photo_ids' per group
# photo_count_per_taxon = filtered_data.groupby('taxon_id')['photo_id'].count()

# Displaying the result
# print(photo_count_per_taxon)


# Download the images

In [None]:
def download_image(url, folder):
    # Extract photo_id from the URL
    photo_id = url.split('/')[-2]

    # Ensure folder exists
    os.makedirs(folder, exist_ok=True)

    # Download the image
    response = requests.get(url)
    if response.status_code == 200:
        with open(os.path.join(folder, f'{photo_id}.jpg'), 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {photo_id}.jpg")
    else:
        print(f"Failed to download {photo_id}.jpg")

In [None]:
def fetch_image_urls(photo_ids):
    # Construct S3 URLs for the images
    s3_urls = [f'https://inaturalist-open-data.s3.amazonaws.com/photos/{photo_id}/original.jpg' for photo_id in photo_ids]
    return s3_urls

In [None]:

# list of photo_ids
# photo_ids = filtered_data['photo_id']  

# Fetch image URLs
# image_urls = fetch_image_urls(photo_ids)

# Choose how many images per species
# images_per_species = 5  # Replace with your desired number

# Download and save images
# for url in image_urls:
#     taxon_id = url.split('/')[-3]  # Extract taxon_id from the URL
#     folder = os.path.join('photos', taxon_id)
#     download_image(url, folder)

