# Data preparation

1. Download image URLs and labels from LILA BC
2. For each selected species: sample and download images, and create a train/test split where applicable
3. Run MegaDetector on all images
4. Run mewc-snip on all images
5. Copy snipped images to Drive

## Setup

In [38]:
from google.colab import drive

import numpy as np
import pandas as pd

In [39]:
# Mount Google Drive at /content/drive. The project paths configured below use
# the relative prefix 'drive/...', which presumes the working directory is /content.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [40]:
# Project root on the mounted Drive. The path is relative, so it only resolves
# while the working directory is /content (Drive is mounted at /content/drive).
project_dir = 'drive/MyDrive/tiger_classification'

# Helper scripts stored on Drive (scripts_dir keeps its trailing slash).
scripts_dir = f'{project_dir}/scripts/'
sample_images_script = f'{scripts_dir}sample_images_from_lila_bc.sh'
download_images_script = f'{scripts_dir}download_images_from_lila_bc.sh'
prep_amur_tiger_images_script = f'{scripts_dir}prep_amur_tiger_images.sh'
run_md_script = f'{scripts_dir}run_megadetector.py'
copy_snipped_images_script = f'{scripts_dir}copy_snipped_images.sh'

# MegaDetector weights on Drive, and the name of the detector's JSON output file.
md_dir = f'{project_dir}/megadetector/'
md_file = f'{md_dir}md_v5a.0.0.pt'
md_out_file = 'md_out.json'

# Local working directory for images; also reused as the sub-folder name under
# project_dir when results are copied back to Drive.
images_dir = 'images'

In [41]:
# Create the local image working directory (-p: no error if it already exists).
!mkdir -p "$images_dir"

In [5]:
!pip install megadetector udocker



In [6]:
# Set up udocker and pull the mewc-snip container image that is run in the
# "Run mewc-snip" section.
# NOTE(review): --allow-root is presumably required because the Colab runtime runs as root — confirm.
!udocker --allow-root install
!udocker --allow-root pull zaandahl/mewc-snip

Info: downloading layer sha256:1246a535b13ec540da6cc956742d3f57cc885eeb3bc8a13a1acf814cb7dbc408
Info: downloading layer sha256:8ea64ddcde0323c874d115e0a770aae1fa0a6377a445f30791487fc3a1bf0873
Info: downloading layer sha256:4f4fb700ef54461cfa02571ae0db9a0dc1e0cdb5577484a6d75e68dc38e8acc1
Info: downloading layer sha256:e380b201f9cfc7dd5d52f5e806863b14c881e26a6d972a34534ed69642d701b3
Info: downloading layer sha256:dff5477a4a4517d28aa8e484bc04e13f82681e4e4e49d3ba9c403f8baf4ecde8
Info: downloading layer sha256:d230d1ae81f7af7b367d98c0b142f9065fa0ee6f16c5d9e11a1fd59b59d56092
Info: downloading layer sha256:c43b5347d05de26c8823110aaa5fb6d21fd0ae396ffd8ac29296f20c002def9d
Info: downloading layer sha256:41114abec237a1f9dea1684b857a3411a69f4391b3429e2ce8705eb9a19b4203
Info: downloading layer sha256:3a39b83ae0ab6d04f26436dc0a6f76ccd06e12298a0d78962bae8a3bacd8faa8
Info: downloading layer sha256:cdbc978275d45d56de7d042b67015d7d4500dbfa4be00d0bdd1a232d78a275d2
Info: downloading layer sha256:b2392c294

In [7]:
def add_subset_column(df, train_ratio, seed=42):
    """
    Adds a 'subset' column to the DataFrame, splitting rows into 'train' and 'test' subsets.

    Each row is assigned independently at random, so the realised share of
    'train' rows only approximates ``train_ratio``. The column is written onto
    ``df`` itself (the input is modified in place) and that same object is
    returned. The global NumPy random state is left untouched.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        train_ratio (float): Probability that a row is labelled 'train'
            (e.g., 0.8 for roughly 80% train). Must lie in [0, 1].
        seed (int, optional): Seed for reproducibility. Default is 42.

    Returns:
        pd.DataFrame: The same DataFrame, now carrying a 'subset' column.

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1].
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError("train_ratio must be between 0 and 1.")

    # Use a private generator rather than np.random.seed(): reseeding the global
    # generator would silently reset the random state for every other piece of
    # code in the kernel. RandomState(seed) produces the same stream that
    # np.random.seed(seed) followed by np.random.rand() would, so the split
    # assignments are unchanged.
    rng = np.random.RandomState(seed)
    random_values = rng.rand(len(df))

    # Rows whose draw falls below train_ratio go to 'train', the rest to 'test'.
    df['subset'] = np.where(random_values < train_ratio, 'train', 'test')

    return df

## Download image URLs and labels from LILA BC

In [8]:
# Fetch the zipped LILA BC index of image URLs and labels.
# -nc skips the download when the zip file is already present.
!wget -O lila_image_urls_and_labels.csv.zip -nc "https://lila.science/public/lila_image_urls_and_labels.csv.zip"

File ‘lila_image_urls_and_labels.csv.zip’ already there; not retrieving.


In [9]:
# Unzip only when the extracted CSV is not already on disk.
![ -f lila_image_urls_and_labels.csv ] || unzip lila_image_urls_and_labels.csv.zip

Archive:  lila_image_urls_and_labels.csv.zip
  inflating: lila_image_urls_and_labels.csv  


In [10]:
# Path of the extracted index CSV; input to the awk filter and the sampling script below.
urls_and_labels = 'lila_image_urls_and_labels.csv'

Check the taxonomy mapping to find relevant species:
https://lila.science/public/lila-taxonomy-mapping_release.csv

## Sample images

### Tiger

#### LILA BC

In [11]:
# Get image URLs: keep the header row (NR==1) plus every row whose 10th
# comma-separated field equals the species name, and write them to samples_file.
# NOTE(review): assumes field 10 is the scientific-name column and that no earlier
# field contains a quoted comma — confirm against the CSV header.
species = 'panthera tigris'
samples_file = images_dir + '/' + species.replace(' ', '_') + '_lila_bc.csv'
!time awk -F ',' -v species="$species" 'NR==1 || $10 == species' "$urls_and_labels" > "$samples_file"
# Line count of the filtered file (the count includes the header row).
!cat "$samples_file" | wc -l


real	0m51.675s
user	0m21.824s
sys	0m4.641s
322


In [12]:
# Copy the samples file to the project's images folder on Drive.
# NOTE(review): this runs before the 'subset' column is added in the next cell,
# so the Drive copy does not contain that column — confirm this is intended.
!cp "$samples_file" "$project_dir/images/$(basename "$samples_file")"

In [13]:
# Label every LILA BC tiger row with the 'test2' subset and rewrite the samples
# file in place (no random split is applied to this set).
df = pd.read_csv(samples_file).assign(subset='test2')
df.to_csv(samples_file, index=False)

In [14]:
# Download the images listed in samples_file into images_dir.
# class_number is handed to the download script; tiger is class 1.
# NOTE(review): presumably the script encodes class and subset in the file names — confirm in the script.
class_number = 1
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

Downloading images into directory: images...
Error downloading https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0405/0132.jpg
Error downloading https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0405/0133.jpg
Error downloading https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0405/0134.jpg
Error downloading https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0405/0135.jpg
Error downloading https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0405/0138.jpg
Total failed downloads: 5
Download complete. Files saved to images.

real	1m3.345s
user	0m3.403s
sys	0m3.033s


#### Amur tiger re-identification challenge

In [15]:
# Download, unpack and prep the ATRW detection train images.
# The prep script is called with the extracted 'trainval' directory, class 1,
# the subset label 'train' and images_dir as destination.
# NOTE(review): wget has no -nc here, so re-running this cell fetches the
# ~2.1G archive again.
!wget -O "$images_dir/atrw_detection_train.tar.gz" https://storage.googleapis.com/public-datasets-lila/cvwc2019/train/atrw_detection_train.tar.gz
!tar -xzf "$images_dir/atrw_detection_train.tar.gz" -C "$images_dir"
!time bash "$prep_amur_tiger_images_script" "$images_dir/trainval" 1 train "$images_dir"

--2024-11-26 08:38:39--  https://storage.googleapis.com/public-datasets-lila/cvwc2019/train/atrw_detection_train.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.101.207, 142.251.2.207, 142.250.141.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.101.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2288778168 (2.1G) [application/x-tar]
Saving to: ‘images/atrw_detection_train.tar.gz’


2024-11-26 08:38:52 (169 MB/s) - ‘images/atrw_detection_train.tar.gz’ saved [2288778168/2288778168]

All files moved and renamed. Source directory images/trainval removed.

real	0m10.604s
user	0m2.483s
sys	0m6.714s


In [16]:
# Download, unpack and prep the ATRW detection test images.
# The prep script is called with the extracted 'test' directory, class 1,
# the subset label 'test' and images_dir as destination.
# NOTE(review): wget has no -nc here, so re-running this cell fetches the
# ~1.4G archive again.
!wget -O "$images_dir/atrw_detection_test.tar.gz" https://storage.googleapis.com/public-datasets-lila/cvwc2019/test/atrw_detection_test.tar.gz
!tar -xzf "$images_dir/atrw_detection_test.tar.gz" -C "$images_dir"
!time bash "$prep_amur_tiger_images_script" "$images_dir/test" 1 test "$images_dir"

--2024-11-26 08:39:37--  https://storage.googleapis.com/public-datasets-lila/cvwc2019/test/atrw_detection_test.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.141.207, 74.125.137.207, 142.250.101.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.141.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1493058467 (1.4G) [application/x-tar]
Saving to: ‘images/atrw_detection_test.tar.gz’


2024-11-26 08:39:49 (126 MB/s) - ‘images/atrw_detection_test.tar.gz’ saved [1493058467/1493058467]

All files moved and renamed. Source directory images/test removed.

real	0m5.483s
user	0m1.368s
sys	0m3.742s


### Lynx

In [17]:
# Sample images: ask the sampling script for sample_size rows of the index
# whose column_to_filter value matches values_to_filter; the script writes the
# resulting CSV into images_dir.
column_to_filter = 'scientific_name'
values_to_filter = 'lynx rufus'
sample_size = 1000
!time bash "$sample_images_script" "$urls_and_labels" "$column_to_filter" "$values_to_filter" "$sample_size" "$images_dir"

Sampling 1000 rows with scientific_name equal to any of {lynx rufus} using seed 42...
Sampling complete. Result written to images/lynx_rufus_sample_1000.csv.

real	1m17.611s
user	0m21.687s
sys	0m6.530s


In [18]:
# Copy the samples file to Drive.
# samples_file is rebuilt here to match the name the sampling script reported
# (<value with underscores>_sample_<size>.csv inside images_dir).
# NOTE(review): the copy is taken before the 'subset' column is added in the
# next cell — confirm this is intended.
samples_file = images_dir + '/' + values_to_filter.replace(' ', '_') + '_sample_' + str(sample_size) + '.csv'
!cp "$samples_file" "$project_dir/images/$(basename "$samples_file")"

In [19]:
# Assign each sampled row to 'train' or 'test' (about 80% train) and rewrite
# the samples file in place with the new 'subset' column.
train_ratio = 0.8
df = add_subset_column(pd.read_csv(samples_file), train_ratio)
df.to_csv(samples_file, index=False)

In [20]:
# Download the sampled images into images_dir; lynx is class 2.
class_number = 2
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

Downloading images into directory: images...
All images successfully downloaded.
Download complete. Files saved to images.

real	3m5.820s
user	0m9.813s
sys	0m8.952s


### Bear

In [21]:
# Sample images: ask the sampling script for sample_size rows of the index
# whose column_to_filter value matches values_to_filter; the script writes the
# resulting CSV into images_dir.
column_to_filter = 'scientific_name'
values_to_filter = 'ursus thibetanus'
sample_size = 1000
!time bash "$sample_images_script" "$urls_and_labels" "$column_to_filter" "$values_to_filter" "$sample_size" "$images_dir"

Sampling 1000 rows with scientific_name equal to any of {ursus thibetanus} using seed 42...
Sampling complete. Result written to images/ursus_thibetanus_sample_1000.csv.

real	1m5.136s
user	0m21.816s
sys	0m6.304s


In [22]:
# Copy the samples file to Drive.
# samples_file is rebuilt here to match the name the sampling script reported
# (<value with underscores>_sample_<size>.csv inside images_dir).
# NOTE(review): the copy is taken before the 'subset' column is added in the
# next cell — confirm this is intended.
samples_file = images_dir + '/' + values_to_filter.replace(' ', '_') + '_sample_' + str(sample_size) + '.csv'
!cp "$samples_file" "$project_dir/images/$(basename "$samples_file")"

In [23]:
# Assign each sampled row to 'train' or 'test' (about 80% train) and rewrite
# the samples file in place with the new 'subset' column.
train_ratio = 0.8
df = add_subset_column(pd.read_csv(samples_file), train_ratio)
df.to_csv(samples_file, index=False)

In [24]:
# Download the sampled images into images_dir; bear is class 3.
class_number = 3
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

Downloading images into directory: images...
All images successfully downloaded.
Download complete. Files saved to images.

real	3m21.278s
user	0m11.163s
sys	0m12.300s


### Deer

In [25]:
# Sample images: unlike the lynx and bear cells, this one filters on the
# 'common_name' column rather than 'scientific_name'. The sampling script
# writes the resulting CSV into images_dir.
column_to_filter = 'common_name'
values_to_filter = 'deer'
sample_size = 1000
!time bash "$sample_images_script" "$urls_and_labels" "$column_to_filter" "$values_to_filter" "$sample_size" "$images_dir"

Sampling 1000 rows with common_name equal to any of {deer} using seed 42...
Sampling complete. Result written to images/deer_sample_1000.csv.

real	0m44.727s
user	0m21.961s
sys	0m5.599s


In [26]:
# Copy the samples file to Drive.
# samples_file is rebuilt here to match the name the sampling script reported
# (<value with underscores>_sample_<size>.csv inside images_dir).
# NOTE(review): the copy is taken before the 'subset' column is added in the
# next cell — confirm this is intended.
samples_file = images_dir + '/' + values_to_filter.replace(' ', '_') + '_sample_' + str(sample_size) + '.csv'
!cp "$samples_file" "$project_dir/images/$(basename "$samples_file")"

In [27]:
# Assign each sampled row to 'train' or 'test' (about 80% train) and rewrite
# the samples file in place with the new 'subset' column.
train_ratio = 0.8
df = add_subset_column(pd.read_csv(samples_file), train_ratio)
df.to_csv(samples_file, index=False)

In [28]:
# Download the sampled images into images_dir; deer is class 4.
class_number = 4
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

Downloading images into directory: images...
All images successfully downloaded.
Download complete. Files saved to images.

real	3m11.809s
user	0m10.379s
sys	0m10.552s


### Bird

In [29]:
# Sample images: filters on the 'common_name' column (as the deer cell does).
# The sampling script writes the resulting CSV into images_dir.
column_to_filter = 'common_name'
values_to_filter = 'bird'
sample_size = 1000
!time bash "$sample_images_script" "$urls_and_labels" "$column_to_filter" "$values_to_filter" "$sample_size" "$images_dir"

Sampling 1000 rows with common_name equal to any of {bird} using seed 42...
Sampling complete. Result written to images/bird_sample_1000.csv.

real	0m51.859s
user	0m21.694s
sys	0m6.382s


In [30]:
# Copy the samples file to Drive.
# samples_file is rebuilt here to match the name the sampling script reported
# (<value with underscores>_sample_<size>.csv inside images_dir).
# NOTE(review): the copy is taken before the 'subset' column is added in the
# next cell — confirm this is intended.
samples_file = images_dir + '/' + values_to_filter.replace(' ', '_') + '_sample_' + str(sample_size) + '.csv'
!cp "$samples_file" "$project_dir/images/$(basename "$samples_file")"

In [31]:
# Assign each sampled row to 'train' or 'test' (about 80% train) and rewrite
# the samples file in place with the new 'subset' column.
train_ratio = 0.8
df = add_subset_column(pd.read_csv(samples_file), train_ratio)
df.to_csv(samples_file, index=False)

In [32]:
# Download the sampled images into images_dir; bird is class 5.
class_number = 5
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

Downloading images into directory: images...
All images successfully downloaded.
Download complete. Files saved to images.

real	3m12.107s
user	0m10.442s
sys	0m10.584s


## Run MegaDetector

In [33]:
# Run MegaDetector: the wrapper script is given the image directory and the
# path of the md_v5a weights file on Drive.
# NOTE(review): presumably it writes md_out.json into images_dir (the next cell
# copies it from there) — confirm in the script.
!time python "$run_md_script" "$images_dir" "$md_file"

[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
 43% 3740/8734 [10:19<14:07,  5.90it/s]Processing image images/class_1_train_2849.jpg
 43% 3741/8734 [10:19<14:12,  5.86it/s]Processing image images/class_1_train_2850.jpg
 43% 3742/8734 [10:19<14:17,  5.82it/s]Processing image images/class_1_train_2851.jpg
 43% 3743/8734 [10:19<14:24,  5.77it/s]Processing image images/class_1_train_2852.jpg
 43% 3744/8734 [10:19<14:02,  5.93it/s]Processing image images/class_1_train_2853.jpg
 43% 3745/8734 [10:19<13:50,  6.01it/s]Processing image images/class_1_train_2854.jpg
 43% 3746/8734 [10:20<13:46,  6.04it/s]Processing image images/class_1_train_2855.jpg
 43% 3747/8734 [10:20<13:36,  6.11it/s]Processing image images/class_1_train_2856.jpg
 43% 3748/8734 [10:20<13:28,  6.17it/s]Processing image images/class_1_train_2857.jpg
 43% 3749/8734 [10:20<13:23,  6.20it/s]Processing image images/class_1_train_2859.jpg
 43% 3750/8734 [10:20<13:41,  6.07it/s]Processing image ima

In [34]:
# Copy the MegaDetector output file from the local image directory to the
# matching images folder under the project directory on Drive.
!cp "$images_dir/$md_out_file" "$project_dir/$images_dir/$md_out_file"

## Run mewc-snip

In [35]:
# Run mewc-snip in the udocker container pulled during setup. The local image
# directory (/content/<images_dir>) is mounted at /images inside the container.
# NOTE(review): the absolute /content prefix presumes the notebook's working
# directory is /content, like the relative paths used elsewhere.
!time udocker --allow-root run \
  --volume "/content/$images_dir":/images \
  zaandahl/mewc-snip

 
 ****************************************************************************** 
 *                                                                            * 
 *               STARTING fc73b157-8e6f-3071-889c-975b6058a3d2                * 
 *                                                                            * 
 ****************************************************************************** 
 executing: python
Processing 8734 images from md_out.json
100% 8734/8734 [17:52<00:00,  8.14it/s]

real	22m56.531s
user	19m13.263s
sys	1m50.264s


## Copy snipped images to Drive

In [36]:
# Create one destination folder on Drive per subset label used in this
# notebook: 'train', 'test' and 'test2' (-p: no error if they already exist).
!mkdir -p "$project_dir/$images_dir/train"
!mkdir -p "$project_dir/$images_dir/test"
!mkdir -p "$project_dir/$images_dir/test2"

In [43]:
# Copy the contents of images_dir/snips into the images folder under the project
# directory on Drive.
# NOTE(review): the trailing 5 is presumably the number of classes (class numbers
# 1-5 are used above) — confirm against the script.
!time bash "$copy_snipped_images_script" "$images_dir/snips" "$project_dir/$images_dir" 5

Files copied successfully.

real	4m52.596s
user	0m45.824s
sys	1m42.705s
