# Sample and download images

1. Download image URLs and labels from LILA BC
2. For each selected species: sample and download images, and create a train/test split where applicable
3. Copy images to Drive

## Setup

In [1]:
from google.colab import drive

import os
import numpy as np
import pandas as pd

In [2]:
# Mount Google Drive at /content/drive; the project directory configured below lives under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Root of the project on the mounted Google Drive.
# NOTE(review): this is a relative path, so it only reaches the mount at
# /content/drive while the working directory is /content — confirm.
project_dir = 'drive/MyDrive/tiger_classification'

# scripts
# The trailing '' component keeps the final '/' on scripts_dir.
scripts_dir = os.path.join(project_dir, 'scripts', '')
sample_images_script = os.path.join(scripts_dir, 'sample_images_from_lila_bc.sh')
download_images_script = os.path.join(scripts_dir, 'download_images_from_lila_bc.sh')
prep_amur_tiger_images_script = os.path.join(scripts_dir, 'prep_amur_tiger_images.sh')

# local images dir
# Created from Python instead of `!mkdir -p` so a failure raises here rather
# than being printed and ignored by the shell escape.
images_dir = 'images'
os.makedirs(images_dir, exist_ok=True)

# set parameters
sample_size = 6000  # rows sampled per species
train_ratio = 0.8   # share of sampled rows assigned to the 'train' subset

In [4]:
# drive images dir
# When remove_images is True, delete the images directory on Drive together with
# everything in it and recreate it empty.
# NOTE(review): destructive and enabled by default — set remove_images to False
# to keep images that were already copied to Drive.
remove_images = True
if remove_images:
  !rm -rf "$project_dir/$images_dir"
  !mkdir -p "$project_dir/$images_dir"

In [4]:
def add_subset_column(df, train_ratio, seed=42):
    """
    Adds a 'subset' column to the DataFrame, splitting data into 'train' and 'test' subsets.

    Each row is assigned independently: it lands in 'train' with probability
    `train_ratio`, otherwise in 'test'. The resulting split is therefore only
    approximately `train_ratio` of the rows.

    The column is written onto `df` itself (the input is modified in place) and
    the same object is returned. An existing 'subset' column is overwritten.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        train_ratio (float): The ratio of the 'train' subset (e.g., 0.8 for 80% train).
        seed (int, optional): Seed for reproducibility. Default is 42.

    Returns:
        pd.DataFrame: The DataFrame with an additional 'subset' column.

    Raises:
        ValueError: If `train_ratio` is not within [0, 1].
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError("train_ratio must be between 0 and 1.")

    # Use a private generator instead of np.random.seed(): it yields the same
    # draws for a given seed but leaves NumPy's global random state untouched,
    # so calling this function does not alter randomness elsewhere.
    rng = np.random.RandomState(seed)

    # One uniform draw in [0, 1) per row
    random_values = rng.rand(len(df))

    # Rows whose draw falls below train_ratio go to 'train', the rest to 'test'
    df['subset'] = np.where(random_values < train_ratio, 'train', 'test')

    return df

## Download image URLs and labels from LILA BC

In [5]:
# Fetch the zipped LILA BC index of image URLs and labels (about 436 MB per the logged run).
# -nc (no-clobber) is meant to skip the download when the zip is already present.
!wget -O lila_image_urls_and_labels.csv.zip -nc "https://lila.science/public/lila_image_urls_and_labels.csv.zip"

--2024-11-30 21:26:12--  https://lila.science/public/lila_image_urls_and_labels.csv.zip
Resolving lila.science (lila.science)... 20.83.252.133
Connecting to lila.science (lila.science)|20.83.252.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 457425682 (436M) [application/zip]
Saving to: ‘lila_image_urls_and_labels.csv.zip’


2024-11-30 21:26:22 (44.8 MB/s) - ‘lila_image_urls_and_labels.csv.zip’ saved [457425682/457425682]



In [6]:
# Extract the CSV only when it is not already present in the working directory.
![ -f lila_image_urls_and_labels.csv ] || unzip lila_image_urls_and_labels.csv.zip

Archive:  lila_image_urls_and_labels.csv.zip
  inflating: lila_image_urls_and_labels.csv  


In [7]:
# Path of the extracted index CSV; it is the input of the awk filter and of the sampling script below.
urls_and_labels = 'lila_image_urls_and_labels.csv'

Check taxonomy mapping to find relevant species:
https://lila.science/public/lila-taxonomy-mapping_release.csv

## Sample images

### Tiger

#### LILA BC
Instead of sampling, take all tiger images from LILA BC, since there are only a few hundred of them.

In [12]:
# get image urls
# Keep the header row (NR==1) plus every row whose 10th comma-separated field
# equals `species`, and write the result to a per-species CSV inside images_dir.
# NOTE(review): presumably field 10 is the scientific-name column; the plain
# -F ',' split also assumes no quoted commas in the preceding fields — confirm
# against the CSV header.
species = 'panthera tigris'
samples_file = images_dir + '/' + species.replace(' ', '_') + '_lila_bc.csv'
!time awk -F ',' -v species="$species" 'NR==1 || $10 == species' "$urls_and_labels" > "$samples_file"
# Number of lines in the filtered file; the count includes the header line.
!cat "$samples_file" | wc -l


real	0m48.924s
user	0m27.567s
sys	0m5.447s
322


In [13]:
# copy samples file to drive
# The file keeps its base name inside the Drive images directory.
# NOTE(review): this copy is taken before the 'subset' column is written to the
# local file in the next cell, so the Drive copy does not contain that column.
!cp "$samples_file" "$project_dir/$images_dir/$(basename "$samples_file")"

In [14]:
# Tag every LILA BC tiger row with the fixed subset label 'test2' and rewrite
# the samples file in place (no random train/test split for this set).
df = pd.read_csv(samples_file).assign(subset='test2')
df.to_csv(samples_file, index=False)

In [15]:
# download images
# The download script receives the samples CSV, the numeric class id and the
# target directory. The logged run reported five URLs that failed to download.
# NOTE(review): file names such as class_1_test2_46.jpg presumably come from the
# class id and the 'subset' column — confirm in the script.
class_number = 1
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

Downloading images into directory: images...
Error downloading https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0405/0132.jpg
Error downloading https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0405/0133.jpg
Error downloading https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0405/0134.jpg
Error downloading https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0405/0135.jpg
Error downloading https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0405/0138.jpg
Total failed downloads: 5
Download complete. Files saved to images.

real	0m45.545s
user	0m4.340s
sys	0m3.876s


In [16]:
# Manual quality control for the LILA BC tiger images (subset test2): drop the
# poorest and the mislabeled pictures. Reviewing by hand is only realistic
# because this set holds roughly 300 images.
bad_test2_indices = (
    46,
    49,
    91,
    134,
    135,
    136,
    158,  # goat
    159,  # goat
    220,
    311,
)
images_to_remove = [f'class_1_test2_{idx}.jpg' for idx in bad_test2_indices]

# Delete each listed file and report the outcome per file; a failure is
# printed and the loop continues with the next file.
for doomed_name in images_to_remove:
    doomed_path = os.path.join(images_dir, doomed_name)
    try:
        os.remove(doomed_path)
        print(f'Deleted: {doomed_path}')
    except PermissionError:
        print(f'Permission denied: {doomed_path}')
    except FileNotFoundError:
        print(f'File not found: {doomed_path}')
    except Exception as err:
        print(f'Error deleting {doomed_path}: {err}')

Deleted: images/class_1_test2_46.jpg
Deleted: images/class_1_test2_49.jpg
Deleted: images/class_1_test2_91.jpg
Deleted: images/class_1_test2_134.jpg
Deleted: images/class_1_test2_135.jpg
Deleted: images/class_1_test2_136.jpg
Deleted: images/class_1_test2_158.jpg
Deleted: images/class_1_test2_159.jpg
Deleted: images/class_1_test2_220.jpg
Deleted: images/class_1_test2_311.jpg


#### Amur tiger re-identification challenge
These images are a substitute for "real" camera trap images because openly available camera trap images of tigers are scarce.

In [6]:
# download, unpack and prep train images
# Steps: fetch the archive into images_dir, extract it there, run the prep
# script on the extracted trainval/ folder with class id 1 and subset "train",
# then delete the archive. The logged run shows the script moving and renaming
# the files and removing the source directory.
file_name = 'atrw_detection_train.tar.gz'
!wget -O "$images_dir/$file_name" https://storage.googleapis.com/public-datasets-lila/cvwc2019/train/atrw_detection_train.tar.gz
!tar -xzf "$images_dir/$file_name" -C "$images_dir"
!time bash "$prep_amur_tiger_images_script" "$images_dir/trainval" 1 train "$images_dir"
!rm "$images_dir/$file_name"

--2024-11-30 18:45:29--  https://storage.googleapis.com/public-datasets-lila/cvwc2019/train/atrw_detection_train.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.31.207, 142.251.111.207, 142.251.16.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.31.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2288778168 (2.1G) [application/x-tar]
Saving to: ‘images/atrw_detection_train.tar.gz’


2024-11-30 18:45:48 (117 MB/s) - ‘images/atrw_detection_train.tar.gz’ saved [2288778168/2288778168]

All files moved and renamed. Source directory images/trainval removed.

real	0m13.357s
user	0m3.082s
sys	0m8.604s


In [7]:
# download, unpack and prep test images
# Same pipeline as for the train archive: fetch into images_dir, extract, run
# the prep script on the extracted test/ folder with class id 1 and subset
# "test", then delete the archive.
file_name = 'atrw_detection_test.tar.gz'
!wget -O "$images_dir/$file_name" https://storage.googleapis.com/public-datasets-lila/cvwc2019/test/atrw_detection_test.tar.gz
!tar -xzf "$images_dir/$file_name" -C "$images_dir"
!time bash "$prep_amur_tiger_images_script" "$images_dir/test" 1 test "$images_dir"
!rm "$images_dir/$file_name"

--2024-11-30 18:46:43--  https://storage.googleapis.com/public-datasets-lila/cvwc2019/test/atrw_detection_test.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.167.207, 142.251.179.207, 64.233.180.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.167.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1493058467 (1.4G) [application/x-tar]
Saving to: ‘images/atrw_detection_test.tar.gz’


2024-11-30 18:46:54 (132 MB/s) - ‘images/atrw_detection_test.tar.gz’ saved [1493058467/1493058467]

All files moved and renamed. Source directory images/test removed.

real	0m7.549s
user	0m1.764s
sys	0m4.953s


### Lynx

In [9]:
# sample images
# Ask the sampling script for sample_size rows whose `column_to_filter` value
# equals `values_to_filter`. Per the logged run it samples with seed 42 and
# writes images/lynx_rufus_sample_6000.csv.
column_to_filter = 'scientific_name'
values_to_filter = 'lynx rufus'
!time bash "$sample_images_script" "$urls_and_labels" "$column_to_filter" "$values_to_filter" "$sample_size" "$images_dir"

Sampling 6000 rows with scientific_name equal to any of {lynx rufus} using seed 42...
Sampling complete. Result written to images/lynx_rufus_sample_6000.csv.

real	0m47.374s
user	0m29.666s
sys	0m5.626s


In [10]:
# copy samples file to drive
samples_file = images_dir + '/' + values_to_filter.replace(' ', '_') + '_sample_' + str(sample_size) + '.csv'
!cp "$samples_file" "$project_dir/$images_dir/$(basename "$samples_file")"

In [11]:
# Load the sampled rows, label each one 'train' or 'test' according to
# train_ratio, and overwrite the samples file with the labelled table.
df = add_subset_column(pd.read_csv(samples_file), train_ratio)
df.to_csv(samples_file, index=False)

In [12]:
# download images
# Class id 2 is used for the lynx images; the script gets the samples CSV, the
# class id and the target directory. The logged run took about 13 minutes.
class_number = 2
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

Downloading images into directory: images...
All images successfully downloaded.
Download complete. Files saved to images.

real	12m50.036s
user	1m15.890s
sys	1m10.419s


### Black bear

In [13]:
# sample images
# Two common names are passed as one comma-separated value; per the logged run
# the script matches rows equal to any of them, samples with seed 42 and writes
# images/american_black_bear_asian_black_bear_sample_6000.csv.
column_to_filter = 'common_name'
values_to_filter = 'american black bear,asian black bear'
!time bash "$sample_images_script" "$urls_and_labels" "$column_to_filter" "$values_to_filter" "$sample_size" "$images_dir"

Sampling 6000 rows with common_name equal to any of {american black bear,asian black bear} using seed 42...
Sampling complete. Result written to images/american_black_bear_asian_black_bear_sample_6000.csv.

real	1m20.660s
user	0m34.074s
sys	0m10.811s


In [14]:
# copy samples file to drive
samples_file = images_dir + '/' + values_to_filter.replace(' ', '_').replace(',', '_') + '_sample_' + str(sample_size) + '.csv'
!cp "$samples_file" "$project_dir/$images_dir/$(basename "$samples_file")"

In [15]:
# Read the sampled rows, attach the train/test label via add_subset_column,
# and write the labelled table back over the samples file.
df = pd.read_csv(samples_file).pipe(add_subset_column, train_ratio)
df.to_csv(samples_file, index=False)

In [16]:
# download images
# Class id 3 is used for the black bear images; the script gets the samples
# CSV, the class id and the target directory.
class_number = 3
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

Downloading images into directory: images...
All images successfully downloaded.
Download complete. Files saved to images.

real	12m29.227s
user	1m18.562s
sys	1m17.347s


### Deer

In [8]:
# sample images
# Ask the sampling script for sample_size rows whose common_name equals 'deer'.
# Per the logged run it samples with seed 42 and writes
# images/deer_sample_6000.csv.
column_to_filter = 'common_name'
values_to_filter = 'deer'
!time bash "$sample_images_script" "$urls_and_labels" "$column_to_filter" "$values_to_filter" "$sample_size" "$images_dir"

Sampling 6000 rows with common_name equal to any of {deer} using seed 42...
Sampling complete. Result written to images/deer_sample_6000.csv.

real	0m56.226s
user	0m30.462s
sys	0m7.085s


In [9]:
# copy samples file to drive
samples_file = images_dir + '/' + values_to_filter.replace(' ', '_') + '_sample_' + str(sample_size) + '.csv'
!cp "$samples_file" "$project_dir/$images_dir/$(basename "$samples_file")"

In [10]:
# Label every sampled row as 'train' or 'test' and persist the result by
# overwriting the samples file.
df = add_subset_column(df=pd.read_csv(samples_file), train_ratio=train_ratio)
df.to_csv(samples_file, index=False)

In [11]:
# download images
# Class id 4 is used for the deer images; the script gets the samples CSV, the
# class id and the target directory.
class_number = 4
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

Downloading images into directory: images...
All images successfully downloaded.
Download complete. Files saved to images.

real	12m30.281s
user	1m19.852s
sys	1m20.722s


### Bird

In [15]:
# sample images
# Ask the sampling script for sample_size rows whose common_name equals 'bird'.
# Per the logged run it samples with seed 42 and writes
# images/bird_sample_6000.csv.
column_to_filter = 'common_name'
values_to_filter = 'bird'
!time bash "$sample_images_script" "$urls_and_labels" "$column_to_filter" "$values_to_filter" "$sample_size" "$images_dir"

Sampling 6000 rows with common_name equal to any of {bird} using seed 42...
Sampling complete. Result written to images/bird_sample_6000.csv.

real	1m24.561s
user	0m27.757s
sys	0m8.526s


In [23]:
# copy samples file to drive
samples_file = images_dir + '/' + values_to_filter.replace(' ', '_') + '_sample_' + str(sample_size) + '.csv'
!cp "$samples_file" "$project_dir/$images_dir/$(basename "$samples_file")"

In [17]:
# Read the sampled rows, assign the train/test label, and write the labelled
# table back to the same samples file.
df = add_subset_column(pd.read_csv(samples_file), train_ratio=train_ratio)
df.to_csv(samples_file, index=False)

In [18]:
# download images
# Class id 5 is used for the bird images; the script gets the samples CSV, the
# class id and the target directory.
class_number = 5
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

Downloading images into directory: images...
All images successfully downloaded.
Download complete. Files saved to images.

real	11m48.074s
user	1m16.604s
sys	1m16.030s


## Copy images to Drive

In [24]:
!time cp "$images_dir/"class_5* "$project_dir/$images_dir"


real	2m14.799s
user	0m0.304s
sys	0m7.433s


In [25]:
# Sanity check: number of entries now present in the Drive images directory
# (everything in that directory is counted, not only image files).
!ls "$project_dir/$images_dir" | wc -l

6001
