# Data preparation

Steps
1. Download image URLs and labels from LILA BC
2. For each species: sample and download images, and create a train/test split if applicable
3. Run MegaDetector on all images
4. Run mewc-snip on all images
5. Copy snipped images to Drive

## Setup

In [1]:
from google.colab import drive

import numpy as np
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive; the project paths defined below
# ('drive/MyDrive/...') are relative, so they presumably rely on the working
# directory being /content — confirm if the notebook is run elsewhere.
drive.mount('/content/drive')

In [3]:
# Project root on the mounted Drive (relative path).
project_dir = 'drive/MyDrive/tiger_detection'

# Helper scripts stored in the project's scripts folder.
scripts_dir = f'{project_dir}/scripts/'
sample_images_script = f'{scripts_dir}sample_images_from_lila_bc.sh'
download_images_script = f'{scripts_dir}download_images_from_lila_bc.sh'
prep_amur_tiger_images_script = f'{scripts_dir}prep_amur_tiger_images.sh'
run_md_script = f'{scripts_dir}run_megadetector.py'
copy_snipped_images_script = f'{scripts_dir}copy_snipped_images.sh'

# MegaDetector weights file and the name of its JSON output file.
md_dir = f'{project_dir}/megadetector/'
md_file = f'{md_dir}md_v5a.0.0.pt'
md_out_file = 'md_out.json'

# Local working directory for images (relative to the current directory).
images_dir = 'images'

In [4]:
# Create the local image working directory; -p makes this a no-op if it exists.
!mkdir -p "$images_dir"

In [None]:
# Install the megadetector and udocker packages (versions are not pinned).
!pip install megadetector udocker

In [None]:
# Set up udocker and pull the mewc-snip image that is run in the
# "Run mewc-snip" section below.
# NOTE(review): --allow-root is presumably needed because the Colab runtime
# runs as root — confirm.
!udocker --allow-root install
!udocker --allow-root pull zaandahl/mewc-snip

In [17]:
def add_subset_column(df, train_ratio, seed=42):
    """
    Adds a 'subset' column to the DataFrame, splitting data into 'train' and 'test' subsets.

    Each row is assigned independently by comparing a uniform random draw with
    train_ratio, so the realised share of 'train' rows only approximates
    train_ratio. The column is written to `df` in place (an existing 'subset'
    column is overwritten) and the same object is returned.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        train_ratio (float): The ratio of the 'train' subset (e.g., 0.8 for 80% train).
        seed (int, optional): Seed for reproducibility. Default is 42.

    Returns:
        pd.DataFrame: The DataFrame with an additional 'subset' column.

    Raises:
        ValueError: If train_ratio is not within [0, 1].
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError("train_ratio must be between 0 and 1.")

    # Use a private generator rather than np.random.seed(), which would reset
    # the process-wide NumPy RNG for all later code. RandomState(seed) yields
    # the same draws as the seeded global generator, so splits are unchanged.
    rng = np.random.RandomState(seed)

    # One uniform draw in [0, 1) per row decides that row's subset.
    random_values = rng.rand(len(df))

    # Rows whose draw falls below train_ratio go to 'train', the rest to 'test'.
    df['subset'] = np.where(random_values < train_ratio, 'train', 'test')

    return df

## Download image URLs and labels from LILA BC

In [None]:
# Fetch the zipped LILA BC image-URL/label table; -nc (no-clobber) avoids
# downloading again when the archive is already present.
!wget -O lila_image_urls_and_labels.csv.zip -nc "https://lila.science/public/lila_image_urls_and_labels.csv.zip"

In [None]:
# Unzip only when the extracted CSV is not already on disk.
![ -f lila_image_urls_and_labels.csv ] || unzip lila_image_urls_and_labels.csv.zip

In [7]:
# Path of the extracted URL/label CSV, passed to the filtering and sampling steps below.
urls_and_labels = 'lila_image_urls_and_labels.csv'

Check taxonomy mapping to find relevant species:
https://lila.science/public/lila-taxonomy-mapping_release.csv

## Sample images

### Tiger

#### LILA BC

In [None]:
# get image urls
# Keep the CSV header plus every row whose 10th comma-separated field equals
# the species name, and write the result to a per-species samples file.
species = 'panthera tigris'
samples_file = images_dir + '/' + species.replace(' ', '_') + '_lila_bc.csv'
# NOTE(review): awk splits on every comma, so a quoted field containing a comma
# before column 10 would shift the comparison — confirm the CSV has none.
!time awk -F ',' -v species="$species" 'NR==1 || $10 == species' "$urls_and_labels" > "$samples_file"
# Print the number of lines written (the header line is included in the count).
!cat "$samples_file" | wc -l

In [9]:
# copy samples file to drive
# The copy keeps the same file name under the project's images folder on Drive.
# NOTE(review): this runs before the 'subset' column is added in the next cell,
# so the Drive copy has no 'subset' column — confirm that is intended.
!cp "$samples_file" "$project_dir/images/$(basename "$samples_file")"

In [10]:
# Tag every row of the samples file with the fixed subset label 'test2'
# and write the table back to the same path (without the index).
df = pd.read_csv(samples_file).assign(subset='test2')
df.to_csv(samples_file, index=False)

In [None]:
# download images
# The download script receives the samples file, the class number and the
# image directory as positional arguments.
# NOTE(review): how the script uses class_number and the 'subset' column is not
# visible here — check download_images_from_lila_bc.sh.
class_number = 1
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

#### Amur tiger re-identification challenge

In [12]:
# download, unpack and prep train images
# The archive is saved into and extracted inside the image directory.
!wget -O "$images_dir/atrw_detection_train.tar.gz" https://storage.googleapis.com/public-datasets-lila/cvwc2019/train/atrw_detection_train.tar.gz
!tar -xzf "$images_dir/atrw_detection_train.tar.gz" -C "$images_dir"
# Prep script arguments: source folder, 1, subset name, image directory.
# NOTE(review): the archive is expected to unpack into "trainval", and 1 is
# presumably the tiger class number used above — confirm against the script.
!time bash "$prep_amur_tiger_images_script" "$images_dir/trainval" 1 train "$images_dir"

In [13]:
# download, unpack and prep test images
# The archive is saved into and extracted inside the image directory.
!wget -O "$images_dir/atrw_detection_test.tar.gz" https://storage.googleapis.com/public-datasets-lila/cvwc2019/test/atrw_detection_test.tar.gz
!tar -xzf "$images_dir/atrw_detection_test.tar.gz" -C "$images_dir"
# Prep script arguments: source folder, 1, subset name, image directory.
# NOTE(review): the archive is expected to unpack into "test", and 1 is
# presumably the tiger class number used above — confirm against the script.
!time bash "$prep_amur_tiger_images_script" "$images_dir/test" 1 test "$images_dir"

### Lynx

In [None]:
# sample images
# The sampling script receives: label CSV, column to filter on, value to match,
# sample size and output directory.
# NOTE(review): the next cell expects the output at
# <images_dir>/<value with underscores>_sample_<size>.csv — confirm in the script.
column_to_filter = 'scientific_name'
values_to_filter = 'lynx rufus'
sample_size = 1000
!time bash "$sample_images_script" "$urls_and_labels" "$column_to_filter" "$values_to_filter" "$sample_size" "$images_dir"

In [15]:
# copy samples file to drive
# Rebuild the samples file path from the filter value and sample size, then
# copy it under the same name into the project's images folder on Drive.
# NOTE(review): this copy is taken before the 'subset' column is added in the
# next cell — confirm that is intended.
samples_file = images_dir + '/' + values_to_filter.replace(' ', '_') + '_sample_' + str(sample_size) + '.csv'
!cp "$samples_file" "$project_dir/images/$(basename "$samples_file")"

In [18]:
# Split the sampled rows into 'train'/'test' and rewrite the samples file in place.
train_ratio = 0.8  # target share of rows labelled 'train'
df = add_subset_column(pd.read_csv(samples_file), train_ratio)
df.to_csv(samples_file, index=False)

In [None]:
# download images
# Positional arguments: samples file, class number, image directory.
# NOTE(review): the meaning of class_number is defined by the download script.
class_number = 2
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

### Bear

In [None]:
# sample images
# The sampling script receives: label CSV, column to filter on, value to match,
# sample size and output directory.
# NOTE(review): the next cell expects the output at
# <images_dir>/<value with underscores>_sample_<size>.csv — confirm in the script.
column_to_filter = 'scientific_name'
values_to_filter = 'ursus thibetanus'
sample_size = 1000
!time bash "$sample_images_script" "$urls_and_labels" "$column_to_filter" "$values_to_filter" "$sample_size" "$images_dir"

In [None]:
# copy samples file to drive
# Rebuild the samples file path from the filter value and sample size, then
# copy it under the same name into the project's images folder on Drive.
# NOTE(review): this copy is taken before the 'subset' column is added in the
# next cell — confirm that is intended.
samples_file = images_dir + '/' + values_to_filter.replace(' ', '_') + '_sample_' + str(sample_size) + '.csv'
!cp "$samples_file" "$project_dir/images/$(basename "$samples_file")"

In [None]:
# Split the sampled rows into 'train'/'test' and rewrite the samples file in place.
train_ratio = 0.8  # target share of rows labelled 'train'
df = add_subset_column(pd.read_csv(samples_file), train_ratio)
df.to_csv(samples_file, index=False)

In [None]:
# download images
# Positional arguments: samples file, class number, image directory.
# NOTE(review): the meaning of class_number is defined by the download script.
class_number = 3
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

### Deer

In [None]:
# sample images
# Unlike the species-level sections, this one filters on the common_name column.
# The sampling script receives: label CSV, column to filter on, value to match,
# sample size and output directory.
# NOTE(review): whether the script matches the value exactly or as a substring
# is not visible here — confirm in sample_images_from_lila_bc.sh.
column_to_filter = 'common_name'
values_to_filter = 'deer'
sample_size = 1000
!time bash "$sample_images_script" "$urls_and_labels" "$column_to_filter" "$values_to_filter" "$sample_size" "$images_dir"

In [26]:
# copy samples file to drive
# Rebuild the samples file path from the filter value and sample size, then
# copy it under the same name into the project's images folder on Drive.
# NOTE(review): this copy is taken before the 'subset' column is added in the
# next cell — confirm that is intended.
samples_file = images_dir + '/' + values_to_filter.replace(' ', '_') + '_sample_' + str(sample_size) + '.csv'
!cp "$samples_file" "$project_dir/images/$(basename "$samples_file")"

In [None]:
# Split the sampled rows into 'train'/'test' and rewrite the samples file in place.
train_ratio = 0.8  # target share of rows labelled 'train'
df = add_subset_column(pd.read_csv(samples_file), train_ratio)
df.to_csv(samples_file, index=False)

In [None]:
# download images
# Positional arguments: samples file, class number, image directory.
# NOTE(review): the meaning of class_number is defined by the download script.
class_number = 4
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

### Bird

In [None]:
# sample images
# Unlike the species-level sections, this one filters on the common_name column.
# The sampling script receives: label CSV, column to filter on, value to match,
# sample size and output directory.
# NOTE(review): whether the script matches the value exactly or as a substring
# is not visible here — confirm in sample_images_from_lila_bc.sh.
column_to_filter = 'common_name'
values_to_filter = 'bird'
sample_size = 1000
!time bash "$sample_images_script" "$urls_and_labels" "$column_to_filter" "$values_to_filter" "$sample_size" "$images_dir"

In [None]:
# copy samples file to drive
# Rebuild the samples file path from the filter value and sample size, then
# copy it under the same name into the project's images folder on Drive.
# NOTE(review): this copy is taken before the 'subset' column is added in the
# next cell — confirm that is intended.
samples_file = images_dir + '/' + values_to_filter.replace(' ', '_') + '_sample_' + str(sample_size) + '.csv'
!cp "$samples_file" "$project_dir/images/$(basename "$samples_file")"

In [31]:
# Split the sampled rows into 'train'/'test' and rewrite the samples file in place.
train_ratio = 0.8  # target share of rows labelled 'train'
df = add_subset_column(pd.read_csv(samples_file), train_ratio)
df.to_csv(samples_file, index=False)

In [None]:
# download images
# Positional arguments: samples file, class number, image directory.
# NOTE(review): the meaning of class_number is defined by the download script.
class_number = 5
!time bash "$download_images_script" "$samples_file" "$class_number" "$images_dir"

## Run MegaDetector

In [None]:
# run megadetector
# The runner script receives the image directory and the MegaDetector weights file.
# NOTE(review): the next cell expects the result at <images_dir>/md_out.json —
# confirm the output path in run_megadetector.py.
!time python "$run_md_script" "$images_dir" "$md_file"

In [None]:
# copy megadetector output file to drive
# The destination is the images folder inside the Drive project directory.
!cp "$images_dir/$md_out_file" "$project_dir/$images_dir/$md_out_file"

## Run mewc-snip

In [None]:
# run mewc-snip
# Run the mewc-snip container with the local image directory mounted at /images.
# NOTE(review): the host path is built as /content/<images_dir>, which assumes
# the notebook's working directory is /content — confirm outside Colab.
!time udocker --allow-root run \
  --volume "/content/$images_dir":/images \
  zaandahl/mewc-snip

## Copy snipped images to Drive

In [None]:
# Create one destination folder on Drive per subset label used in this notebook
# ('train' and 'test' from add_subset_column, 'test2' for the LILA BC tiger images).
!mkdir -p "$project_dir/$images_dir/train"
!mkdir -p "$project_dir/$images_dir/test"
!mkdir -p "$project_dir/$images_dir/test2"

In [None]:
# Copy the snipped images from <images_dir>/snips into the Drive image folder.
# NOTE(review): the trailing 5 is presumably the number of classes (class
# numbers 1-5 are used above) — confirm against copy_snipped_images.sh.
!time bash "$copy_snipped_images_script" "$images_dir/snips" "$project_dir/$images_dir" 5