# Sample images

1. Get image URLs and labels from LILA BC
2. For each selected species, sample n images and assign each one to a train/test split

## Setup

In [1]:
from google.colab import drive

import os
import numpy as np
import pandas as pd

import subprocess

In [2]:
# Mount Google Drive at /content/drive so files under it can be read and written (Colab only).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Project root. This is a relative path, so it only resolves inside the Drive
# mounted at /content/drive when the working directory is /content.
project_dir = 'drive/MyDrive/TeraiNet'

# Shell scripts used by this notebook
scripts_dir = f'{project_dir}/scripts/'
get_image_urls_script = f'{scripts_dir}get_image_urls_from_lila_bc.sh'

# Directory name for sample files (used both locally and under project_dir)
samples_dir = 'samples'

# Share of sampled images that should be labelled 'train'
train_ratio = 0.8

In [4]:
# for a fresh start, remove samples dir
# WARNING: destructive. While the flag is True, both the local samples directory
# and the one under project_dir are deleted and recreated empty, so any sample
# files written by an earlier run are lost.
remove_samples_dir = True
if remove_samples_dir:
  # IPython substitutes $samples_dir / $project_dir with the Python variables of the same name.
  !rm -rf "$samples_dir"
  !mkdir -p "$samples_dir"
  !rm -rf "$project_dir/$samples_dir"
  !mkdir -p "$project_dir/$samples_dir"

In [5]:
def run_get_image_urls_script(get_image_urls_script, urls_and_labels, column_to_filter, values_to_filter, samples_file):
    """
    Run the URL-extraction shell script through bash and capture what it prints.

    Parameters:
    - get_image_urls_script (str): Path of the bash script to execute.
    - urls_and_labels (str): Forwarded to the script as its 1st argument.
    - column_to_filter (str): Forwarded to the script as its 2nd argument.
    - values_to_filter (str): Forwarded to the script as its 3rd argument.
    - samples_file (str): Forwarded to the script as its 4th argument.

    Returns:
    - tuple: (stdout, stderr) of the script, both decoded as text.

    Note: the script's exit status is not inspected, so a failing script does
    not raise here; callers have to look at the returned stderr.
    """
    script_args = (urls_and_labels, column_to_filter, values_to_filter, samples_file)

    # Arguments are passed as a list (no shell string), so values containing
    # spaces or commas reach the script unchanged.
    completed = subprocess.run(
        ["bash", get_image_urls_script, *script_args],
        capture_output=True,
        text=True,
    )

    return completed.stdout, completed.stderr

In [6]:
def get_image_urls(
    project_dir: str,
    samples_dir: str,
    species: str,
    get_image_urls_script: str,
    urls_and_labels: str,
    column_to_filter: str,
    values_to_filter: str
  ):
  """
    Runs the URL-filtering script for one class and reports where its result was written.

    The output file is named '<samples_dir>/lila_bc_image_urls_<species>.csv'.
    The script's stdout and stderr are printed. No sampling or train/test
    labelling happens here.

    Parameters:
    - project_dir (str): The base directory of the project. Accepted but not used by this function.
    - samples_dir (str): The directory in which the output file is placed.
    - species (str): Class name; only used to build the output file name.
    - get_image_urls_script (str): Path to the script used for retrieving image URLs.
    - urls_and_labels (str): Path to the CSV file containing image URLs and labels.
    - column_to_filter (str): The column in the CSV file used for filtering (e.g., 'scientific_name').
    - values_to_filter (str): Comma-separated values to filter within the specified column.

    Returns:
    - str: Path of the file the script was asked to write.
  """

  # Build the output path for this class
  file_name = 'lila_bc_image_urls_' + species + '.csv'
  samples_file = samples_dir + '/' + file_name

  # Run the script, then echo its stdout followed by its stderr
  script_output = run_get_image_urls_script(
      get_image_urls_script, urls_and_labels, column_to_filter, values_to_filter, samples_file
  )
  for stream_text in script_output:
    print(stream_text)

  return samples_file

In [7]:
def sample_n_images_per_species(image_urls: str, species_samples_dict: dict, column_to_filter: str, seed: int = 42) -> pd.DataFrame:
    """
    Draws a reproducible random sample of rows for every requested species.

    Parameters:
    - image_urls (str): Path to the image URLs CSV file.
    - species_samples_dict (dict): Maps a species name to the number of rows
      to draw for it. A request larger than the number of matching rows is
      capped, so all matching rows are returned in that case.
    - column_to_filter (str): Column whose value is compared with the species name.
    - seed (int, optional): Random state passed to every per-species draw. Default is 42.

    Returns:
    - pd.DataFrame: The sampled rows, concatenated in the order of the dict
      with a fresh 0..n-1 index; an empty DataFrame if the dict is empty.
    """
    all_rows = pd.read_csv(image_urls, low_memory=False)

    def _draw(label, requested):
        # Never ask pandas for more rows than exist for this species.
        matching = all_rows[all_rows[column_to_filter] == label]
        return matching.sample(n=min(requested, len(matching)), random_state=seed)

    per_species = [_draw(label, requested) for label, requested in species_samples_dict.items()]

    if not per_species:
        return pd.DataFrame()
    return pd.concat(per_species, ignore_index=True)

In [8]:
def add_subset_column(df: pd.DataFrame, train_ratio: float, seed: int = 42) -> pd.DataFrame:
    """
    Returns a copy of the DataFrame with a 'subset' column marking each row as 'train' or 'test'.

    Every row is assigned independently: it becomes 'train' when a uniform
    random draw in [0, 1) is below train_ratio, so the realised train share
    only approximates train_ratio (and is not stratified by species).

    Parameters:
        df (pd.DataFrame): The input DataFrame. It is not modified.
        train_ratio (float): The ratio of the 'train' subset (e.g., 0.8 for 80% train).
        seed (int, optional): Seed for reproducibility. Default is 42.

    Returns:
        pd.DataFrame: A new DataFrame with an additional 'subset' column.

    Raises:
        ValueError: If train_ratio is not within [0, 1].
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError('train_ratio must be between 0 and 1.')

    # A private RandomState seeded with `seed` produces exactly the draws that
    # np.random.seed(seed) followed by np.random.rand(...) would, but it does
    # not reset the process-wide NumPy random state as a side effect.
    rng = np.random.RandomState(seed)
    random_values = rng.rand(len(df))

    # assign() works on a copy, so the caller's DataFrame is left untouched.
    return df.assign(subset=np.where(random_values < train_ratio, 'train', 'test'))

## Download image URLs and labels from LILA BC

In [9]:
# Download the zipped LILA BC index of image URLs and labels; -nc skips the download if the zip is already present.
!wget -O lila_image_urls_and_labels.csv.zip -nc "https://lila.science/public/lila_image_urls_and_labels.csv.zip"

File ‘lila_image_urls_and_labels.csv.zip’ already there; not retrieving.


In [10]:
# Extract the archive only when the CSV does not exist yet.
![ -f lila_image_urls_and_labels.csv ] || unzip lila_image_urls_and_labels.csv.zip

In [11]:
# Path of the extracted index CSV; handed to get_image_urls as the file to filter.
urls_and_labels = 'lila_image_urls_and_labels.csv'

## Inspect species counts

Check taxonomy mapping to find relevant species:
https://lila.science/public/lila-taxonomy-mapping_release.csv

In [12]:
# Load only the common_name column, which is all the species counts need.
# The path comes from `urls_and_labels` so it stays in sync with the rest of the notebook.
# The dtype is fixed to str so pandas does not infer a different type per
# internal chunk (the source of mixed-dtype warnings).
lila_image_urls_and_labels_df = pd.read_csv(
    urls_and_labels,
    usecols=['common_name'],
    dtype={'common_name': str},
)
lila_image_urls_and_labels_df.shape

  lila_image_urls_and_labels_df = pd.read_csv('lila_image_urls_and_labels.csv', usecols=['common_name'])


(23591744, 1)

In [13]:
# Common names to inspect, grouped by target class.
species_list = [
    # tiger
    "tiger",
    # leopard
    "leopard",
    # black bear: not enough images of asian black bear alone
    "asian black bear",
    "american black bear",
    # other carnivores (including substitutes, i.e. black-backed jackal and gray fox)
    "dhole",
    "black-backed jackal",
    "gray fox",
    "leopard cat",
    "mainland leopard cat",
    "marbled cat",
    "asian golden cat",
    # deer
    "deer",
    # wild boar
    "wild boar",
    # buffalo: substitute for gaur
    "african buffalo",
    "cape buffalo",
    # rhino: substitute for indian rhino
    "white rhinoceros",
    # elephant: not enough images of asian elephant alone
    "asian elephant",
    "african elephant",
    "african bush elephant",
    # bird
    "bird",
]

In [14]:
# Images per common name, reported in species_list order; names missing from the data show 0.
species_counts = (
    lila_image_urls_and_labels_df['common_name']
    .value_counts()
    .reindex(species_list, fill_value=0)
)
species_counts

Unnamed: 0_level_0,count
common_name,Unnamed: 1_level_1
tiger,321
leopard,2991
asian black bear,1221
american black bear,32854
dhole,185
black-backed jackal,13287
gray fox,19794
leopard cat,266
mainland leopard cat,246
marbled cat,271


## Get image URLs from LILA BC

The goal is to get about 2000 images per class (plus a buffer of 100). The number of available leopard images sets this limit, because we want the classes to be balanced.

### Tiger

In [15]:
# Tiger is the one class matched on scientific_name instead of common_name.
species = 'tiger'
column_to_filter = 'scientific_name'
values_to_filter = 'panthera tigris'
samples_file = get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
)

Getting image URLs based on scientific_name equal to any of {panthera tigris}...
Total matching rows found: 321
Filtering complete. Result written to samples/lila_bc_image_urls_tiger.csv.




In [16]:
# use all tiger images because we put them into a separate test for later evaluation
species_samples_dict = {
    'panthera tigris': 321,
}

sampled_image_urls = sample_n_images_per_species(samples_file, species_samples_dict, column_to_filter)
sampled_image_urls = add_subset_column(sampled_image_urls, train_ratio)
sampled_samples_file = project_dir + '/' + samples_dir + '/lila_bc_image_urls_' + species + '_sampled.csv'
sampled_image_urls.to_csv(sampled_samples_file, index=False)
!echo "Sampled rows: $(tail -n +2 "$sampled_samples_file" | wc -l)"
sampled_image_urls[column_to_filter].value_counts()

Sampled rows: 321


Unnamed: 0_level_0,count
scientific_name,Unnamed: 1_level_1
panthera tigris,321


### Leopard

In [17]:
# Leopard: filter the index on its common name.
species = 'leopard'
column_to_filter = 'common_name'
values_to_filter = 'leopard'
samples_file = get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
)

Getting image URLs based on common_name equal to any of {leopard}...
Total matching rows found: 2991
Filtering complete. Result written to samples/lila_bc_image_urls_leopard.csv.




In [18]:
# use all leopard images because ~900 fail to download later on
species_samples_dict = {
    'leopard': 2991,
}

sampled_image_urls = sample_n_images_per_species(samples_file, species_samples_dict, column_to_filter)
sampled_image_urls = add_subset_column(sampled_image_urls, train_ratio)
sampled_samples_file = project_dir + '/' + samples_dir + '/lila_bc_image_urls_' + species + '_sampled.csv'
sampled_image_urls.to_csv(sampled_samples_file, index=False)
!echo "Sampled rows: $(tail -n +2 "$sampled_samples_file" | wc -l)"
sampled_image_urls[column_to_filter].value_counts()

Sampled rows: 2991


Unnamed: 0_level_0,count
common_name,Unnamed: 1_level_1
leopard,2991


### Black bear

Include `american black bear` because there are not enough camera trap images of Asian black bears.

In [19]:
# Black bear class: Asian black bear plus American black bear.
species = 'black_bear'
column_to_filter = 'common_name'
values_to_filter = 'asian black bear,american black bear'
samples_file = get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
)

Getting image URLs based on common_name equal to any of {asian black bear,american black bear}...
Total matching rows found: 34075
Filtering complete. Result written to samples/lila_bc_image_urls_black_bear.csv.




In [20]:
species_samples_dict = {
    'asian black bear': 1221,
    'american black bear': 879,
}

sampled_image_urls = sample_n_images_per_species(samples_file, species_samples_dict, column_to_filter)
sampled_image_urls = add_subset_column(sampled_image_urls, train_ratio)
sampled_samples_file = project_dir + '/' + samples_dir + '/lila_bc_image_urls_' + species + '_sampled.csv'
sampled_image_urls.to_csv(sampled_samples_file, index=False)
!echo "Sampled rows: $(tail -n +2 "$sampled_samples_file" | wc -l)"
sampled_image_urls[column_to_filter].value_counts()

Sampled rows: 2100


Unnamed: 0_level_0,count
common_name,Unnamed: 1_level_1
asian black bear,1221
american black bear,879


### Other carnivores

Include `dhole,black-backed jackal,gray fox,leopard cat,mainland leopard cat,marbled cat,asian golden cat` to cover a wide range of other carnivores in the Terai ecosystem.

In [21]:
# Other carnivores class: seven common names filtered in one pass.
species = 'other_carnivores'
column_to_filter = 'common_name'
values_to_filter = 'dhole,black-backed jackal,gray fox,leopard cat,mainland leopard cat,marbled cat,asian golden cat'
samples_file = get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
)

Getting image URLs based on common_name equal to any of {dhole,black-backed jackal,gray fox,leopard cat,mainland leopard cat,marbled cat,asian golden cat}...
Total matching rows found: 34402
Filtering complete. Result written to samples/lila_bc_image_urls_other_carnivores.csv.




In [22]:
species_samples_dict = {
    'dhole': 185,
    'black-backed jackal': 522,
    'gray fox': 522,
    'leopard cat': 150,
    'mainland leopard cat': 150,
    'marbled cat': 271,
    'asian golden cat': 300,
}

sampled_image_urls = sample_n_images_per_species(samples_file, species_samples_dict, column_to_filter)
sampled_image_urls = add_subset_column(sampled_image_urls, train_ratio)
sampled_samples_file = project_dir + '/' + samples_dir + '/lila_bc_image_urls_' + species + '_sampled.csv'
sampled_image_urls.to_csv(sampled_samples_file, index=False)
!echo "Sampled rows: $(tail -n +2 "$sampled_samples_file" | wc -l)"
sampled_image_urls[column_to_filter].value_counts()

Sampled rows: 2100


Unnamed: 0_level_0,count
common_name,Unnamed: 1_level_1
black-backed jackal,522
gray fox,522
asian golden cat,300
marbled cat,271
dhole,185
leopard cat,150
mainland leopard cat,150


### Deer

In [23]:
# Deer: filter the index on the common name 'deer'.
species = 'deer'
column_to_filter = 'common_name'
values_to_filter = 'deer'
samples_file = get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
)

Getting image URLs based on common_name equal to any of {deer}...
Total matching rows found: 360489
Filtering complete. Result written to samples/lila_bc_image_urls_deer.csv.




In [24]:
species_samples_dict = {
    'deer': 2100,
}

sampled_image_urls = sample_n_images_per_species(samples_file, species_samples_dict, column_to_filter)
sampled_image_urls = add_subset_column(sampled_image_urls, train_ratio)
sampled_samples_file = project_dir + '/' + samples_dir + '/lila_bc_image_urls_' + species + '_sampled.csv'
sampled_image_urls.to_csv(sampled_samples_file, index=False)
!echo "Sampled rows: $(tail -n +2 "$sampled_samples_file" | wc -l)"
sampled_image_urls[column_to_filter].value_counts()

Sampled rows: 2100


Unnamed: 0_level_0,count
common_name,Unnamed: 1_level_1
deer,2100


### Wild boar

In [25]:
# Wild boar: filter the index on the common name 'wild boar'.
species = 'wild_boar'
column_to_filter = 'common_name'
values_to_filter = 'wild boar'
samples_file = get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
)

Getting image URLs based on common_name equal to any of {wild boar}...
Total matching rows found: 142701
Filtering complete. Result written to samples/lila_bc_image_urls_wild_boar.csv.




In [26]:
species_samples_dict = {
    'wild boar': 2100,
}

sampled_image_urls = sample_n_images_per_species(samples_file, species_samples_dict, column_to_filter)
sampled_image_urls = add_subset_column(sampled_image_urls, train_ratio)
sampled_samples_file = project_dir + '/' + samples_dir + '/lila_bc_image_urls_' + species + '_sampled.csv'
sampled_image_urls.to_csv(sampled_samples_file, index=False)
!echo "Sampled rows: $(tail -n +2 "$sampled_samples_file" | wc -l)"
sampled_image_urls[column_to_filter].value_counts()

Sampled rows: 2100


Unnamed: 0_level_0,count
common_name,Unnamed: 1_level_1
wild boar,2100


### Buffalo

Use `african buffalo,cape buffalo` because camera trap images of gaur are unavailable.

In [27]:
# Buffalo class: African and Cape buffalo stand in for gaur.
species = 'buffalo'
column_to_filter = 'common_name'
values_to_filter = 'african buffalo,cape buffalo'
samples_file = get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
)

Getting image URLs based on common_name equal to any of {african buffalo,cape buffalo}...
Total matching rows found: 94114
Filtering complete. Result written to samples/lila_bc_image_urls_buffalo.csv.




In [28]:
species_samples_dict = {
    'african buffalo': 1050,
    'cape buffalo': 1050,
}

sampled_image_urls = sample_n_images_per_species(samples_file, species_samples_dict, column_to_filter)
sampled_image_urls = add_subset_column(sampled_image_urls, train_ratio)
sampled_samples_file = project_dir + '/' + samples_dir + '/lila_bc_image_urls_' + species + '_sampled.csv'
sampled_image_urls.to_csv(sampled_samples_file, index=False)
!echo "Sampled rows: $(tail -n +2 "$sampled_samples_file" | wc -l)"
sampled_image_urls[column_to_filter].value_counts()

Sampled rows: 2100


Unnamed: 0_level_0,count
common_name,Unnamed: 1_level_1
african buffalo,1050
cape buffalo,1050


### Rhino

Use `white rhinoceros` because camera trap images of Indian rhinoceros are unavailable.

In [36]:
# Rhino class: white rhinoceros stands in for Indian rhinoceros.
species = 'rhino'
column_to_filter = 'common_name'
values_to_filter = 'white rhinoceros'
samples_file = get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
)

Getting image URLs based on common_name equal to any of {white rhinoceros}...
Total matching rows found: 7307
Filtering complete. Result written to samples/lila_bc_image_urls_rhino.csv.




In [37]:
species_samples_dict = {
    'white rhinoceros': 2100,
}

sampled_image_urls = sample_n_images_per_species(samples_file, species_samples_dict, column_to_filter)
sampled_image_urls = add_subset_column(sampled_image_urls, train_ratio)
sampled_samples_file = project_dir + '/' + samples_dir + '/lila_bc_image_urls_' + species + '_sampled.csv'
sampled_image_urls.to_csv(sampled_samples_file, index=False)
!echo "Sampled rows: $(tail -n +2 "$sampled_samples_file" | wc -l)"
sampled_image_urls[column_to_filter].value_counts()

Sampled rows: 2100


Unnamed: 0_level_0,count
common_name,Unnamed: 1_level_1
white rhinoceros,2100


### Elephant

Include `african elephant,african bush elephant` because there are not enough camera trap images of Asian elephants.

In [31]:
# Elephant class: Asian elephant plus two African elephant names.
species = 'elephant'
column_to_filter = 'common_name'
values_to_filter = 'asian elephant,african elephant,african bush elephant'
samples_file = get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
)

Getting image URLs based on common_name equal to any of {asian elephant,african elephant,african bush elephant}...
Total matching rows found: 188879
Filtering complete. Result written to samples/lila_bc_image_urls_elephant.csv.




In [32]:
species_samples_dict = {
    'asian elephant': 325,
    'african elephant': 888,
    'african bush elephant': 888,
}

sampled_image_urls = sample_n_images_per_species(samples_file, species_samples_dict, column_to_filter)
sampled_image_urls = add_subset_column(sampled_image_urls, train_ratio)
sampled_samples_file = project_dir + '/' + samples_dir + '/lila_bc_image_urls_' + species + '_sampled.csv'
sampled_image_urls.to_csv(sampled_samples_file, index=False)
!echo "Sampled rows: $(tail -n +2 "$sampled_samples_file" | wc -l)"
sampled_image_urls[column_to_filter].value_counts()

Sampled rows: 2101


Unnamed: 0_level_0,count
common_name,Unnamed: 1_level_1
african elephant,888
african bush elephant,888
asian elephant,325


### Bird

In [33]:
# Bird: filter the index on the common name 'bird'.
species = 'bird'
column_to_filter = 'common_name'
values_to_filter = 'bird'
samples_file = get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
)

Getting image URLs based on common_name equal to any of {bird}...
Total matching rows found: 297686
Filtering complete. Result written to samples/lila_bc_image_urls_bird.csv.




In [34]:
species_samples_dict = {
    'bird': 2100,
}

sampled_image_urls = sample_n_images_per_species(samples_file, species_samples_dict, column_to_filter)
sampled_image_urls = add_subset_column(sampled_image_urls, train_ratio)
sampled_samples_file = project_dir + '/' + samples_dir + '/lila_bc_image_urls_' + species + '_sampled.csv'
sampled_image_urls.to_csv(sampled_samples_file, index=False)
!echo "Sampled rows: $(tail -n +2 "$sampled_samples_file" | wc -l)"
sampled_image_urls[column_to_filter].value_counts()

Sampled rows: 2100


Unnamed: 0_level_0,count
common_name,Unnamed: 1_level_1
bird,2100
