# Sample images

1. Download image URLs and labels from LILA BC
2. For each selected species: sample images and create a train/test split where applicable

## Setup

In [1]:
import os
import shutil
import subprocess

import numpy as np
import pandas as pd
from google.colab import drive

In [2]:
# Mount Google Drive at /content/drive; `project_dir` below points into this mount.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Project root on the mounted Drive. This is a relative path, so it resolves
# against the kernel's current working directory (the directory that contains
# the `drive` mount point).
project_dir = 'drive/MyDrive/TeraiNet'

# scripts
scripts_dir = project_dir + '/scripts/'
get_image_urls_script = scripts_dir + 'get_image_urls_from_lila_bc.sh'

# local samples dir (created next to the notebook's working directory)
samples_dir = 'samples'
os.makedirs(samples_dir, exist_ok=True)

# set parameters
# Per-class sample size: includes some buffer to really end up with 2000 images
# per class (leopard image availability sets this boundary if classes should
# be balanced).
sample_size = 2100
# Share of rows labelled 'train' when a train/test split is created.
train_ratio = 0.8

In [4]:
# remove samples dir from drive if it exists for a fresh start
remove_samples = True
if remove_samples:
  # Guard the destructive step: with an empty `samples_dir` the target would
  # collapse to the project root itself.
  if not project_dir or not samples_dir:
    raise ValueError('project_dir and samples_dir must be non-empty before removing samples.')
  # Plain concatenation (not os.path.join) keeps the target identical to the
  # former "$project_dir/$samples_dir" shell path.
  drive_samples_dir = project_dir + '/' + samples_dir
  # ignore_errors=True: a missing directory is not an error, as with `rm -rf`.
  shutil.rmtree(drive_samples_dir, ignore_errors=True)
  os.makedirs(drive_samples_dir, exist_ok=True)

In [5]:
def run_get_image_urls_script(get_image_urls_script, urls_and_labels, column_to_filter, values_to_filter, samples_file):
    """
    Runs the URL-filter shell script with bash and returns what it printed.

    Parameters:
        get_image_urls_script (str): Path of the shell script to execute.
        urls_and_labels (str): First script argument.
        column_to_filter (str): Second script argument.
        values_to_filter (str): Third script argument.
        samples_file (str): Fourth script argument.

    Returns:
        tuple[str, str]: The script's captured stdout and stderr as text.
        The exit status is not inspected, so a failing script does not raise here.
    """
    script_args = (urls_and_labels, column_to_filter, values_to_filter, samples_file)

    # Argument list (no shell string), so values containing spaces or commas
    # reach the script unchanged.
    completed = subprocess.run(
        ["bash", get_image_urls_script, *script_args],
        capture_output=True,
        text=True,
    )

    return completed.stdout, completed.stderr

In [6]:
def get_image_urls(
    project_dir: str,
    samples_dir: str,
    species: str,
    get_image_urls_script: str,
    urls_and_labels: str,
    column_to_filter: str,
    values_to_filter: str,
    train_ratio: float
  ):
  """
  Writes the per-species image-URL CSV and tags every row with a subset label.

  The filter script is run first and its stdout and stderr are printed. The CSV it
  produced at `<project_dir>/<samples_dir>/lila_bc_image_urls_<species>.csv` is then
  read back, given a 'subset' column and written over the same file.

  Parameters:
      project_dir (str): Project root that contains the samples directory.
      samples_dir (str): Name of the samples directory below `project_dir`.
      species (str): Label used in the output file name; 'tiger' gets special handling.
      get_image_urls_script (str): Path of the filter shell script.
      urls_and_labels (str): Passed through to the filter script.
      column_to_filter (str): Passed through to the filter script.
      values_to_filter (str): Passed through to the filter script.
      train_ratio (float): Train share used by `add_subset_column` (unused for 'tiger').

  Returns:
      None
  """
  # get image urls
  file_name = 'lila_bc_image_urls_' + species + '.csv'
  samples_file = '/'.join((project_dir, samples_dir, file_name))
  script_output = run_get_image_urls_script(
      get_image_urls_script,
      urls_and_labels,
      column_to_filter,
      values_to_filter,
      samples_file,
  )
  for stream_text in script_output:
    print(stream_text)

  # add subset column to samples file
  image_urls = pd.read_csv(samples_file, low_memory=False)
  if species != 'tiger':
    image_urls = add_subset_column(image_urls, train_ratio)
  else:
    # Tiger rows are not split: every row receives the fixed label 'test2'.
    image_urls['subset'] = 'test2'
  image_urls.to_csv(samples_file, index=False)

In [7]:
def add_subset_column(df: pd.DataFrame, train_ratio: float, seed: int = 42) -> pd.DataFrame:
    """
    Returns a copy of the DataFrame with a 'subset' column splitting rows into 'train' and 'test'.

    Every row gets an independent uniform draw from [0, 1) and is labelled 'train'
    when the draw is below `train_ratio`, otherwise 'test'. The realised train share
    therefore only approximates `train_ratio`. The input DataFrame is not modified
    and NumPy's global random state is left untouched.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        train_ratio (float): The ratio of the 'train' subset (e.g., 0.8 for 80% train).
        seed (int, optional): Seed for reproducibility. Default is 42.

    Returns:
        pd.DataFrame: A new DataFrame with an additional (or replaced) 'subset' column.

    Raises:
        ValueError: If train_ratio is not between 0 and 1 (inclusive).
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError('train_ratio must be between 0 and 1.')

    # A private legacy generator produces the same draws as
    # np.random.seed(seed) followed by np.random.rand(...), but does not reseed
    # the process-wide generator that other code may rely on.
    rng = np.random.RandomState(seed)
    random_values = rng.rand(len(df))

    # assign() builds a new frame, so the caller's DataFrame stays as it was.
    return df.assign(subset=np.where(random_values < train_ratio, 'train', 'test'))

## Download image URLs and labels from LILA BC

In [8]:
# Download the zipped CSV of image URLs and labels; `-nc` skips the download when the archive already exists.
!wget -O lila_image_urls_and_labels.csv.zip -nc "https://lila.science/public/lila_image_urls_and_labels.csv.zip"

File ‘lila_image_urls_and_labels.csv.zip’ already there; not retrieving.


In [9]:
# Unpack the archive only when the CSV is not already present.
![ -f lila_image_urls_and_labels.csv ] || unzip lila_image_urls_and_labels.csv.zip

In [10]:
# Path of the unpacked CSV; passed to the filter script as its first argument in the cells below.
urls_and_labels = 'lila_image_urls_and_labels.csv'

## Inspect species counts

Check taxonomy mapping to find relevant species:
https://lila.science/public/lila-taxonomy-mapping_release.csv

In [11]:
%time
lila_image_urls_and_labels_df = pd.read_csv('lila_image_urls_and_labels.csv', usecols=['common_name'])
lila_image_urls_and_labels_df.shape

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.11 µs


  lila_image_urls_and_labels_df = pd.read_csv('lila_image_urls_and_labels.csv', usecols=['common_name'])


(23591744, 1)

In [12]:
# Common names to count in the label file, in the order they should be reported.
species_list = [
    "tiger",
    "leopard",
    # not enough images of asian black bear alone
    "asian black bear",
    "american black bear",
    # other carnivores (including substitutes, i.e. black-backed jackal and gray fox)
    "dhole",
    "black-backed jackal",
    "gray fox",
    "leopard cat",
    "mainland leopard cat",
    "marbled cat",
    "asian golden cat",
    "deer",
    "wild boar",
    # substitute for gaur
    "african buffalo",
    "cape buffalo",
    # substitute for indian rhino
    "white rhinoceros",
    # not enough images of asian elephant alone
    "asian elephant",
    "african elephant",
    "african bush elephant",
    "bird",
]

In [13]:
# Row count per name in `species_list`, kept in list order; names that do not
# occur in the label file are reported as 0 instead of NaN.
species_counts = (
    lila_image_urls_and_labels_df['common_name']
    .value_counts()
    .reindex(species_list, fill_value=0)
)
species_counts

Unnamed: 0_level_0,count
common_name,Unnamed: 1_level_1
tiger,321
leopard,2991
asian black bear,1221
american black bear,32854
dhole,185
black-backed jackal,13287
gray fox,19794
leopard cat,266
mainland leopard cat,246
marbled cat,271


## Get image URLs from LILA BC

### Tiger

In [14]:
# Tiger is the only class selected by scientific name instead of common name.
species = 'tiger'
column_to_filter = 'scientific_name'
values_to_filter = 'panthera tigris'
get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
    train_ratio=train_ratio,
)

Getting image URLs based on scientific_name equal to any of {panthera tigris}...
Total matching rows found: 321
Filtering complete. Result written to drive/MyDrive/TeraiNet/samples/lila_bc_image_urls_tiger.csv.




### Leopard

In [15]:
# Leopard rows, selected by common name.
species = 'leopard'
column_to_filter = 'common_name'
values_to_filter = 'leopard'
get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
    train_ratio=train_ratio,
)

Getting image URLs based on common_name equal to any of {leopard}...
Total matching rows found: 2991
Filtering complete. Result written to drive/MyDrive/TeraiNet/samples/lila_bc_image_urls_leopard.csv.




### Black bear

Include `american black bear` because there are not enough camera trap images of Asian black bears.

In [16]:
# Asian and American black bear rows are pooled into one 'black_bear' file.
species = 'black_bear'
column_to_filter = 'common_name'
values_to_filter = 'asian black bear,american black bear'
get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
    train_ratio=train_ratio,
)

Getting image URLs based on common_name equal to any of {asian black bear,american black bear}...
Total matching rows found: 34075
Filtering complete. Result written to drive/MyDrive/TeraiNet/samples/lila_bc_image_urls_black_bear.csv.




### Other carnivores

Include `dhole,black-backed jackal,gray fox,leopard cat,mainland leopard cat,marbled cat,asian golden cat` to cover a wide range of other carnivores in the Terai ecosystem.

In [17]:
# Several carnivore common names are pooled into one 'other_carnivores' file.
species = 'other_carnivores'
column_to_filter = 'common_name'
values_to_filter = 'dhole,black-backed jackal,gray fox,leopard cat,mainland leopard cat,marbled cat,asian golden cat'
get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
    train_ratio=train_ratio,
)

Getting image URLs based on common_name equal to any of {dhole,black-backed jackal,gray fox,leopard cat,mainland leopard cat,marbled cat,asian golden cat}...
Total matching rows found: 34402
Filtering complete. Result written to drive/MyDrive/TeraiNet/samples/lila_bc_image_urls_other_carnivores.csv.




### Deer

In [18]:
# Deer rows, selected by common name.
species = 'deer'
column_to_filter = 'common_name'
values_to_filter = 'deer'
get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
    train_ratio=train_ratio,
)

Getting image URLs based on common_name equal to any of {deer}...
Total matching rows found: 360489
Filtering complete. Result written to drive/MyDrive/TeraiNet/samples/lila_bc_image_urls_deer.csv.




### Wild boar

In [19]:
# Wild boar rows, selected by common name.
species = 'wild_boar'
column_to_filter = 'common_name'
values_to_filter = 'wild boar'
get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
    train_ratio=train_ratio,
)

Getting image URLs based on common_name equal to any of {wild boar}...
Total matching rows found: 142701
Filtering complete. Result written to drive/MyDrive/TeraiNet/samples/lila_bc_image_urls_wild_boar.csv.




### Buffalo

Use `african buffalo,cape buffalo` because camera trap images of gaur are unavailable.

In [20]:
# African and Cape buffalo rows are pooled into one 'buffalo' file.
species = 'buffalo'
column_to_filter = 'common_name'
values_to_filter = 'african buffalo,cape buffalo'
get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
    train_ratio=train_ratio,
)

Getting image URLs based on common_name equal to any of {african buffalo,cape buffalo}...
Total matching rows found: 94114
Filtering complete. Result written to drive/MyDrive/TeraiNet/samples/lila_bc_image_urls_buffalo.csv.




### Rhino

Use `white rhinoceros` because camera trap images of Indian rhinoceros are unavailable.

In [21]:
# White rhinoceros rows are written to the 'rhino' file.
species = 'rhino'
column_to_filter = 'common_name'
values_to_filter = 'white rhinoceros'
get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
    train_ratio=train_ratio,
)

Getting image URLs based on common_name equal to any of {white rhinoceros}...
Total matching rows found: 7307
Filtering complete. Result written to drive/MyDrive/TeraiNet/samples/lila_bc_image_urls_rhino.csv.




### Elephant

Include `african elephant,african bush elephant` because there are not enough camera trap images of Asian elephants.

In [22]:
# Asian and African elephant rows are pooled into one 'elephant' file.
species = 'elephant'
column_to_filter = 'common_name'
values_to_filter = 'asian elephant,african elephant,african bush elephant'
get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
    train_ratio=train_ratio,
)

Getting image URLs based on common_name equal to any of {asian elephant,african elephant,african bush elephant}...
Total matching rows found: 188879
Filtering complete. Result written to drive/MyDrive/TeraiNet/samples/lila_bc_image_urls_elephant.csv.




### Bird

In [23]:
# Bird rows, selected by common name.
species = 'bird'
column_to_filter = 'common_name'
values_to_filter = 'bird'
get_image_urls(
    project_dir=project_dir,
    samples_dir=samples_dir,
    species=species,
    get_image_urls_script=get_image_urls_script,
    urls_and_labels=urls_and_labels,
    column_to_filter=column_to_filter,
    values_to_filter=values_to_filter,
    train_ratio=train_ratio,
)

Getting image URLs based on common_name equal to any of {bird}...
Total matching rows found: 297686
Filtering complete. Result written to drive/MyDrive/TeraiNet/samples/lila_bc_image_urls_bird.csv.


