# Data preparation

1. Download image URLs and labels from LILA BC
2. For each selected species: sample and download images, and create a train/test split if applicable
3. Run MegaDetector on all images
4. Run mewc-snip on all images
5. Copy snipped images to Drive

## Setup

In [1]:
from google.colab import drive

import os
import numpy as np
import pandas as pd

In [2]:
# Mount Google Drive at /content/drive; the project directory configured below lives on Drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install megadetector udocker



In [4]:
# Set up udocker (run with --allow-root) and pull the mewc-snip image that is executed
# in the "Run mewc-snip" section.
!udocker --allow-root install
!udocker --allow-root pull zaandahl/mewc-snip

Info: downloading layer sha256:1246a535b13ec540da6cc956742d3f57cc885eeb3bc8a13a1acf814cb7dbc408
Info: downloading layer sha256:8ea64ddcde0323c874d115e0a770aae1fa0a6377a445f30791487fc3a1bf0873
Info: downloading layer sha256:4f4fb700ef54461cfa02571ae0db9a0dc1e0cdb5577484a6d75e68dc38e8acc1
Info: downloading layer sha256:e380b201f9cfc7dd5d52f5e806863b14c881e26a6d972a34534ed69642d701b3
Info: downloading layer sha256:dff5477a4a4517d28aa8e484bc04e13f82681e4e4e49d3ba9c403f8baf4ecde8
Info: downloading layer sha256:d230d1ae81f7af7b367d98c0b142f9065fa0ee6f16c5d9e11a1fd59b59d56092
Info: downloading layer sha256:c43b5347d05de26c8823110aaa5fb6d21fd0ae396ffd8ac29296f20c002def9d
Info: downloading layer sha256:41114abec237a1f9dea1684b857a3411a69f4391b3429e2ce8705eb9a19b4203
Info: downloading layer sha256:3a39b83ae0ab6d04f26436dc0a6f76ccd06e12298a0d78962bae8a3bacd8faa8
Info: downloading layer sha256:cdbc978275d45d56de7d042b67015d7d4500dbfa4be00d0bdd1a232d78a275d2
Info: downloading layer sha256:b2392c294

In [5]:
def add_subset_column(df, train_ratio, seed=42):
    """
    Adds a 'subset' column to the DataFrame, splitting data into 'train' and 'test' subsets.

    Every row draws its own uniform random number in [0, 1) and is labelled 'train'
    when that number is below ``train_ratio``, otherwise 'test'. The realised share of
    'train' rows therefore only approximates ``train_ratio``.

    The column is written onto ``df`` itself (the input is modified in place) and the
    same object is returned.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        train_ratio (float): The ratio of the 'train' subset (e.g., 0.8 for 80% train).
        seed (int, optional): Seed for reproducibility. Default is 42.

    Returns:
        pd.DataFrame: The DataFrame with an additional 'subset' column.

    Raises:
        ValueError: If ``train_ratio`` is not within [0, 1].
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError("train_ratio must be between 0 and 1.")

    # Use a private generator rather than np.random.seed(): it produces the same
    # sequence for a given seed, but does not reset NumPy's global random state
    # for whatever code runs after this call.
    rng = np.random.RandomState(seed)

    # One uniform draw per row decides the subset
    random_values = rng.rand(len(df))

    # Assign subsets based on the train_ratio
    df['subset'] = np.where(random_values < train_ratio, 'train', 'test')

    return df

In [6]:
project_dir = 'drive/MyDrive/tiger_classification'

scripts_dir = project_dir + '/scripts/'
sample_images_script = scripts_dir + 'sample_images_from_lila_bc.sh'
download_images_script = scripts_dir + 'download_images_from_lila_bc.sh'
prep_amur_tiger_images_script = scripts_dir + 'prep_amur_tiger_images.sh'
run_md_script = scripts_dir + 'run_megadetector.py'
copy_snipped_images_script = scripts_dir + 'copy_snipped_images.sh'

md_dir = project_dir + '/megadetector/'
md_file = md_dir + 'md_v5a.0.0.pt'
md_out_file = 'md_out.json'

images_dir = 'images'
!mkdir -p "$images_dir"

In [7]:
# WARNING: when remove_images is True, the images directory on Drive is deleted
# recursively and recreated empty, discarding whatever an earlier run stored there.
# Set it to False to keep the existing Drive contents.
remove_images = True
if remove_images:
  # wipe the Drive target, then recreate it empty
  !rm -rf "$project_dir/$images_dir"
  !mkdir -p "$project_dir/$images_dir"

## Download image URLs and labels from LILA BC

In [None]:
# Download the zipped LILA BC table of image URLs and labels (about 436 MB per the recorded output).
# NOTE(review): -nc is presumably meant to skip the download when the archive already exists — confirm
# how the installed wget version handles -nc together with -O.
!wget -O lila_image_urls_and_labels.csv.zip -nc "https://lila.science/public/lila_image_urls_and_labels.csv.zip"

--2024-11-29 18:23:52--  https://lila.science/public/lila_image_urls_and_labels.csv.zip
Resolving lila.science (lila.science)... 20.83.252.133
Connecting to lila.science (lila.science)|20.83.252.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 457425682 (436M) [application/zip]
Saving to: ‘lila_image_urls_and_labels.csv.zip’


In [8]:
# Unzip the archive only if the extracted CSV is not already present
![ -f lila_image_urls_and_labels.csv ] || unzip lila_image_urls_and_labels.csv.zip

Archive:  lila_image_urls_and_labels.csv.zip
  inflating: lila_image_urls_and_labels.csv  


In [9]:
# Path of the extracted CSV; used as input by the filtering and sampling steps below
urls_and_labels = 'lila_image_urls_and_labels.csv'

Check the taxonomy mapping to find the relevant species:
https://lila.science/public/lila-taxonomy-mapping_release.csv

## Sample images

### Tiger

#### LILA BC
Rather than drawing a sample, use all tiger images available on LILA BC, since there are only a few hundred.

In [10]:
# Get image URLs: keep the header row (NR==1) plus every row whose 10th
# comma-separated field equals `species`, and write them to `samples_file`.
# NOTE(review): awk splits on raw commas, so a quoted field that contains a comma
# would shift the columns; column 10 is presumably the scientific name — confirm
# against the CSV header.
species = 'panthera tigris'
samples_file = images_dir + '/' + species.replace(' ', '_') + '_lila_bc.csv'
!time awk -F ',' -v species="$species" 'NR==1 || $10 == species' "$urls_and_labels" > "$samples_file"
# number of lines in the result, header line included
!cat "$samples_file" | wc -l



real	1m35.277s
user	0m23.504s
sys	0m4.678s
322


In [11]:
# Copy samples file to Drive under the same file name.
# NOTE(review): the copy is taken before the 'subset' column is added in the next
# cell, so the Drive copy presumably lacks that column — confirm this is intended.
!cp "$samples_file" "$project_dir/$images_dir/$(basename "$samples_file")"

In [12]:
# Mark every row of the tiger samples file with subset 'test2' and overwrite the file
df = pd.read_csv(samples_file).assign(subset='test2')
df.to_csv(samples_file, index=False)

In [13]:
# Download the images listed in the samples file into images_dir.
# class_number is passed to the download script; judging by the file names used
# later in this notebook (e.g. class_1_test2_46.jpg) it becomes part of each file name.
class_number = 1
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

Downloading images into directory: images...
Error downloading https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0405/0132.jpg
Error downloading https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0405/0133.jpg
Error downloading https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0405/0134.jpg
Error downloading https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0405/0135.jpg
Error downloading https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0405/0138.jpg
Total failed downloads: 5
Download complete. Files saved to images.

real	0m41.339s
user	0m3.375s
sys	0m3.259s


In [47]:
# Manual quality control for the test2 tiger images: delete the worst and the
# wrongly labeled ones. Checking by hand is only practical because the set is
# small (~300 images).
images_to_remove = [
    'class_1_test2_46.jpg',
    'class_1_test2_49.jpg',
    'class_1_test2_91.jpg',
    'class_1_test2_134.jpg',
    'class_1_test2_135.jpg',
    'class_1_test2_136.jpg',
    'class_1_test2_158.jpg',  # goat
    'class_1_test2_159.jpg',  # goat
    'class_1_test2_220.jpg',
    'class_1_test2_311.jpg',
]

# Try each deletion on its own so that one failure does not stop the rest;
# report the outcome per file.
for file_path in (os.path.join(images_dir, name) for name in images_to_remove):
    try:
        os.remove(file_path)
    except FileNotFoundError:
        print(f'File not found: {file_path}')
    except PermissionError:
        print(f'Permission denied: {file_path}')
    except Exception as e:
        print(f'Error deleting {file_path}: {e}')
    else:
        print(f'Deleted: {file_path}')

Deleted: images/class_1_test2_46.jpg
Deleted: images/class_1_test2_49.jpg
Deleted: images/class_1_test2_91.jpg
Deleted: images/class_1_test2_134.jpg
Deleted: images/class_1_test2_135.jpg
Deleted: images/class_1_test2_136.jpg
Deleted: images/class_1_test2_158.jpg
Deleted: images/class_1_test2_159.jpg
Deleted: images/class_1_test2_220.jpg
Deleted: images/class_1_test2_311.jpg


#### Amur tiger re-identification challenge
These images are a substitute for "real" camera trap images because openly available camera trap images of tigers are scarce.

In [14]:
# Download, unpack and prep train images.
# The archive (about 2.1 GB per the recorded output) is saved into images_dir and
# extracted there; the prep script is then given the extracted 'trainval' directory,
# class number 1, subset 'train' and the target directory. According to its recorded
# output it moves and renames the files and removes the source directory.
!wget -O "$images_dir/atrw_detection_train.tar.gz" https://storage.googleapis.com/public-datasets-lila/cvwc2019/train/atrw_detection_train.tar.gz
!tar -xzf "$images_dir/atrw_detection_train.tar.gz" -C "$images_dir"
!time bash "$prep_amur_tiger_images_script" "$images_dir/trainval" 1 train "$images_dir"

--2024-11-29 18:29:30--  https://storage.googleapis.com/public-datasets-lila/cvwc2019/train/atrw_detection_train.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.197.207, 74.125.135.207, 74.125.142.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.197.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2288778168 (2.1G) [application/x-tar]
Saving to: ‘images/atrw_detection_train.tar.gz’


2024-11-29 18:29:47 (131 MB/s) - ‘images/atrw_detection_train.tar.gz’ saved [2288778168/2288778168]

All files moved and renamed. Source directory images/trainval removed.

real	0m12.614s
user	0m2.667s
sys	0m6.562s


In [15]:
# Download, unpack and prep test images.
# Same steps as for the train archive: the archive (about 1.4 GB per the recorded
# output) is extracted into images_dir, and the prep script receives the extracted
# 'test' directory, class number 1, subset 'test' and the target directory.
!wget -O "$images_dir/atrw_detection_test.tar.gz" https://storage.googleapis.com/public-datasets-lila/cvwc2019/test/atrw_detection_test.tar.gz
!tar -xzf "$images_dir/atrw_detection_test.tar.gz" -C "$images_dir"
!time bash "$prep_amur_tiger_images_script" "$images_dir/test" 1 test "$images_dir"

--2024-11-29 18:30:39--  https://storage.googleapis.com/public-datasets-lila/cvwc2019/test/atrw_detection_test.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.99.207, 142.250.107.207, 142.251.188.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.99.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1493058467 (1.4G) [application/x-tar]
Saving to: ‘images/atrw_detection_test.tar.gz’


2024-11-29 18:30:50 (143 MB/s) - ‘images/atrw_detection_test.tar.gz’ saved [1493058467/1493058467]

All files moved and renamed. Source directory images/test removed.

real	0m6.924s
user	0m1.526s
sys	0m3.742s


### Lynx

In [16]:
# Sample images: ask the sampling script for `sample_size` rows of the LILA BC table
# whose `column_to_filter` equals `values_to_filter`. Per the recorded output the
# result is written to images/lynx_rufus_sample_2000.csv (the script reports seed 42).
column_to_filter = 'scientific_name'
values_to_filter = 'lynx rufus'
sample_size = 2000
!time bash "$sample_images_script" "$urls_and_labels" "$column_to_filter" "$values_to_filter" "$sample_size" "$images_dir"

Sampling 2000 rows with scientific_name equal to any of {lynx rufus} using seed 42...
Sampling complete. Result written to images/lynx_rufus_sample_2000.csv.

real	0m33.237s
user	0m21.729s
sys	0m4.543s


In [17]:
# copy samples file to drive
samples_file = images_dir + '/' + values_to_filter.replace(' ', '_') + '_sample_' + str(sample_size) + '.csv'
!cp "$samples_file" "$project_dir/$images_dir/$(basename "$samples_file")"

In [18]:
# Split the sampled rows into 'train'/'test' (70 % train) and overwrite the samples file
train_ratio = 0.7
df = add_subset_column(pd.read_csv(samples_file), train_ratio)
df.to_csv(samples_file, index=False)

In [19]:
# Download the sampled lynx images into images_dir; class_number 2 is passed to the download script
class_number = 2
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

Downloading images into directory: images...
All images successfully downloaded.
Download complete. Files saved to images.

real	3m55.796s
user	0m20.066s
sys	0m18.065s


### Black bear

In [20]:
# Sample images: two common names are passed as one comma-separated value; per the
# recorded output the script matches rows equal to any of them and writes
# images/american_black_bear_asian_black_bear_sample_2000.csv.
column_to_filter = 'common_name'
values_to_filter = 'american black bear,asian black bear'
sample_size = 2000
!time bash "$sample_images_script" "$urls_and_labels" "$column_to_filter" "$values_to_filter" "$sample_size" "$images_dir"

Sampling 2000 rows with common_name equal to any of {american black bear,asian black bear} using seed 42...
Sampling complete. Result written to images/american_black_bear_asian_black_bear_sample_2000.csv.

real	0m34.847s
user	0m23.291s
sys	0m4.436s


In [21]:
# Copy samples file to Drive.
# The file name is rebuilt from the filter values: spaces and commas are both
# replaced by underscores, matching the name reported by the sampling script.
samples_file = images_dir + '/' + values_to_filter.replace(' ', '_').replace(',', '_') + '_sample_' + str(sample_size) + '.csv'
!cp "$samples_file" "$project_dir/$images_dir/$(basename "$samples_file")"

In [22]:
# Split the sampled rows into 'train'/'test' (70 % train) and overwrite the samples file
train_ratio = 0.7
df = add_subset_column(pd.read_csv(samples_file), train_ratio)
df.to_csv(samples_file, index=False)

In [23]:
# Download the sampled black bear images into images_dir; class_number 3 is passed to the download script
class_number = 3
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

Downloading images into directory: images...
All images successfully downloaded.
Download complete. Files saved to images.

real	3m50.649s
user	0m20.496s
sys	0m19.143s


### Deer

In [24]:
# Sample images: `sample_size` rows whose common name equals 'deer'; per the recorded
# output the result is written to images/deer_sample_2000.csv.
column_to_filter = 'common_name'
values_to_filter = 'deer'
sample_size = 2000
!time bash "$sample_images_script" "$urls_and_labels" "$column_to_filter" "$values_to_filter" "$sample_size" "$images_dir"

Sampling 2000 rows with common_name equal to any of {deer} using seed 42...
Sampling complete. Result written to images/deer_sample_2000.csv.

real	0m50.766s
user	0m21.605s
sys	0m6.490s


In [25]:
# copy samples file to drive
samples_file = images_dir + '/' + values_to_filter.replace(' ', '_') + '_sample_' + str(sample_size) + '.csv'
!cp "$samples_file" "$project_dir/$images_dir/$(basename "$samples_file")"

In [26]:
# Split the sampled rows into 'train'/'test' (70 % train) and overwrite the samples file
train_ratio = 0.7
df = add_subset_column(pd.read_csv(samples_file), train_ratio)
df.to_csv(samples_file, index=False)

In [27]:
# Download the sampled deer images into images_dir; class_number 4 is passed to the download script
class_number = 4
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

Downloading images into directory: images...
All images successfully downloaded.
Download complete. Files saved to images.

real	4m12.229s
user	0m21.552s
sys	0m21.932s


### Bird

In [28]:
# Sample images: `sample_size` rows whose common name equals 'bird'; per the recorded
# output the result is written to images/bird_sample_2000.csv.
column_to_filter = 'common_name'
values_to_filter = 'bird'
sample_size = 2000
!time bash "$sample_images_script" "$urls_and_labels" "$column_to_filter" "$values_to_filter" "$sample_size" "$images_dir"

Sampling 2000 rows with common_name equal to any of {bird} using seed 42...
Sampling complete. Result written to images/bird_sample_2000.csv.

real	0m42.027s
user	0m23.770s
sys	0m5.360s


In [29]:
# copy samples file to drive
samples_file = images_dir + '/' + values_to_filter.replace(' ', '_') + '_sample_' + str(sample_size) + '.csv'
!cp "$samples_file" "$project_dir/$images_dir/$(basename "$samples_file")"

In [30]:
# Split the sampled rows into 'train'/'test' (70 % train) and overwrite the samples file
train_ratio = 0.7
df = add_subset_column(pd.read_csv(samples_file), train_ratio)
df.to_csv(samples_file, index=False)

In [31]:
# Download the sampled bird images into images_dir; class_number 5 is passed to the download script
class_number = 5
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

Downloading images into directory: images...
All images successfully downloaded.
Download complete. Files saved to images.

real	4m5.901s
user	0m21.103s
sys	0m21.665s


## Run MegaDetector

In [32]:
# Run MegaDetector: the run script receives the image directory and the model file.
# The results file (md_out_file) is expected inside images_dir afterwards, since the
# next cell copies it from there.
!time python "$run_md_script" "$images_dir" "$md_file"

[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
 61% 7740/12734 [23:01<17:07,  4.86it/s]Processing image images/class_3_train_1539.jpg
 61% 7741/12734 [23:01<17:03,  4.88it/s]Processing image images/class_3_train_154.jpg
 61% 7742/12734 [23:01<17:08,  4.85it/s]Processing image images/class_3_train_1540.jpg
 61% 7743/12734 [23:01<17:23,  4.79it/s]Processing image images/class_3_train_1542.JPG
 61% 7744/12734 [23:02<18:06,  4.59it/s]Processing image images/class_3_train_1543.JPG
 61% 7745/12734 [23:02<17:38,  4.71it/s]Processing image images/class_3_train_1544.jpg
 61% 7746/12734 [23:02<17:28,  4.76it/s]Processing image images/class_3_train_1546.jpg
 61% 7747/12734 [23:02<17:16,  4.81it/s]Processing image images/class_3_train_1548.jpg
 61% 7748/12734 [23:02<17:36,  4.72it/s]Processing image images/class_3_train_1549.jpg
 61% 7749/12734 [23:03<17:36,  4.72it/s]Processing image images/class_3_train_1550.JPG
 61% 7750/12734 [23:03<17:34,  4.73it/s]Processing

In [33]:
# Copy the MegaDetector output file from the local images directory to the images directory on Drive
!cp "$images_dir/$md_out_file" "$project_dir/$images_dir/$md_out_file"

## Run mewc-snip

In [34]:
# Run mewc-snip in udocker with the local images directory mounted at /images inside
# the container. Per the recorded output it processes the images listed in md_out.json.
# NOTE(review): the host path is hard-coded as /content/<images_dir>, which assumes the
# notebook's working directory is /content — confirm.
!time udocker --allow-root run \
  --volume "/content/$images_dir":/images \
  zaandahl/mewc-snip

 
 ****************************************************************************** 
 *                                                                            * 
 *               STARTING 6ec37875-6bc7-3dd1-9633-01e446d71570                * 
 *                                                                            * 
 ****************************************************************************** 
 executing: python
Processing 12734 images from md_out.json
100% 12734/12734 [24:15<00:00,  8.75it/s]

real	29m39.591s
user	24m19.505s
sys	2m5.280s


## Copy snipped images to Drive

In [36]:
# Create target directory structure on Drive: one directory per subset (train, test, test2)
!mkdir -p "$project_dir/$images_dir/train"
!mkdir -p "$project_dir/$images_dir/test"
!mkdir -p "$project_dir/$images_dir/test2"

In [37]:
# Copy snipped images from images_dir/snips to the images directory on Drive.
# NOTE(review): the trailing argument 5 is presumably the number of classes — confirm
# against copy_snipped_images.sh.
!time bash "$copy_snipped_images_script" "$images_dir/snips" "$project_dir/$images_dir" 5

Files copied successfully.

real	8m42.068s
user	1m7.714s
sys	2m56.842s


In [38]:
# Sanity check: number of entries per class directory in the train subset on Drive
!ls "$project_dir/$images_dir/train/class_1" | wc -l
!ls "$project_dir/$images_dir/train/class_2" | wc -l
!ls "$project_dir/$images_dir/train/class_3" | wc -l
!ls "$project_dir/$images_dir/train/class_4" | wc -l
!ls "$project_dir/$images_dir/train/class_5" | wc -l

4896
1451
1667
2101
1918


In [39]:
# Sanity check: number of entries per class directory in the test subset on Drive
!ls "$project_dir/$images_dir/test/class_1" | wc -l
!ls "$project_dir/$images_dir/test/class_2" | wc -l
!ls "$project_dir/$images_dir/test/class_3" | wc -l
!ls "$project_dir/$images_dir/test/class_4" | wc -l
!ls "$project_dir/$images_dir/test/class_5" | wc -l

2825
626
677
920
842


In [48]:
# Sanity check: number of entries in the test2 subset on Drive (only class_1 is counted here)
!ls "$project_dir/$images_dir/test2/class_1" | wc -l

307
