# Preprocess images

1. Run MegaDetector on all images
2. Snip images
3. Copy snipped images to Kaggle Output

Note: The images must already have been downloaded to Drive via Colab and then uploaded to Kaggle as a zipped folder.

## Setup

In [None]:
# Install the megadetector package into the notebook environment.
!pip install megadetector

In [None]:
# Clone the project repository into the current working directory.
!git clone https://github.com/alexvmt/tiger_classification.git

In [None]:
# Move up two directory levels; the relative paths used by the following cells
# ('kaggle/working/...', 'kaggle/input/...') are resolved from that directory.
# NOTE(review): presumably the notebook starts in /kaggle/working - confirm.
%cd ../../

In [None]:
project_dir = 'kaggle/working/tiger_classification'

# scripts
scripts_dir = project_dir + '/scripts/'
run_md_script = scripts_dir + 'run_megadetector.py'
copy_snipped_images_script = scripts_dir + 'copy_snipped_images.sh'

# md
md_dir = 'megadetector'
!mkdir -p "$md_dir"
md_file = 'md_v5a.0.0.pt'
md_out_file = 'md_out.json'

# images dir
images_input_dir = 'kaggle/input/images/images'
images_output_dir = 'kaggle/working/images'
!mkdir -p "$images_output_dir"

# specify number of classes
num_classes = 5

# set parameters for snipping images
INPUT_DIR = images_input_dir
MD_FILE = md_out_file
SNIP_DIR = 'snips'
LOWER_CONF = 0.05
SNIP_SIZE = 600

In [None]:
# Download the MegaDetector v5a weights file and store it as md_dir/md_file.
!wget -O "$md_dir/$md_file" https://github.com/agentmorris/MegaDetector/releases/download/v5.0/md_v5a.0.0.pt

In [None]:
# Download MegaDetector's visualization_utils.py into the current directory so
# that it can be imported as a local module (`import visualization_utils`).
!wget -O visualization_utils.py "https://raw.githubusercontent.com/agentmorris/MegaDetector/refs/heads/main/megadetector/visualization/visualization_utils.py"

In [None]:
import os
import json
from tqdm import tqdm
from pathlib import Path
import visualization_utils as viz_utils

In [None]:
# from mewc-detect
def contains_animal(json_image):
    """Return True if an image record holds at least one animal detection.

    Args:
        json_image: Dict describing one image, as found in the ``images``
            list of the detector output JSON. Its optional ``detections``
            entry is a list of dicts that each carry a ``category`` string.

    Returns:
        True if any detection has category ``"1"`` (the category this
        notebook treats as "animal"). False if there is no such detection,
        or if ``detections`` is missing, null or empty.
    """
    # A missing key and an explicit null are both treated as "no detections"
    # instead of raising on len(None).
    detections = json_image.get('detections') or []
    # .get() tolerates detection entries without a 'category' field.
    return any(detection.get('category') == "1" for detection in detections)

## Run MegaDetector

In [None]:
# run megadetector: the script receives the input image directory, the model
# weights file and the megadetector directory; `time` reports the run duration
# NOTE(review): the snipping cell reads md_dir/md_out.json - presumably written
# by this script; confirm against run_megadetector.py
!time python "$run_md_script" "$images_input_dir" "$md_dir/$md_file" "$md_dir"

## Snip images
Follow [mewc-snip](https://github.com/zaandahl/mewc-snip)

In [None]:
# Snip images: for every image record in the detector output that contains an
# animal, crop the detections, resize each animal crop to SNIP_SIZE x SNIP_SIZE
# and save it to SNIP_DIR as "<image stem>-<crop number><image extension>".
json_path = Path(md_dir, MD_FILE)
Path(SNIP_DIR).mkdir(parents=True, exist_ok=True)

with open(json_path, "r") as read_json:
    json_data = json.load(read_json)
print("Processing " + str(len(json_data['images'])) + " images from " + MD_FILE)
for json_image in tqdm(json_data['images']):
    try:
        if not contains_animal(json_image):
            continue
        image_file = Path(json_image.get('file'))
        # Only the file name is used: images are looked up directly in INPUT_DIR.
        input_path = Path(INPUT_DIR, image_file.name)
        if not input_path.is_file():
            continue
        pil_image = viz_utils.load_image(input_path)
        crops = viz_utils.crop_image(
            detections=json_image['detections'],
            image=pil_image,
            confidence_threshold=float(LOWER_CONF),
        )
        # The crop index must advance for EVERY crop, not only for animal
        # crops; otherwise the first non-animal detection blocks all crops
        # that follow it.
        # NOTE(review): indexing detections by crop position assumes that
        # crop_image returns crops in detection order and that detections
        # below LOWER_CONF only appear after the kept ones - confirm.
        for crop_num, crop in enumerate(crops):
            if json_image['detections'][crop_num].get('category') != '1':
                continue  # only snip animals
            resized_crop = viz_utils.resize_image(crop, int(SNIP_SIZE), int(SNIP_SIZE))
            output_path = Path(SNIP_DIR, image_file.stem + '-' + str(crop_num) + image_file.suffix)
            resized_crop.save(output_path)
    except Exception as exc:
        # Stay best-effort (one bad image must not stop the run), but report
        # the failure instead of hiding it.
        image_ref = json_image.get('file') if isinstance(json_image, dict) else json_image
        tqdm.write("Skipping " + str(image_ref) + ": " + repr(exc))

## Copy snipped images to Kaggle Output

In [None]:
# create target directory structure: train, test and test2 sub-directories
# inside the images output directory (-p: no error if they already exist)
!mkdir -p "$images_output_dir/train"
!mkdir -p "$images_output_dir/test"
!mkdir -p "$images_output_dir/test2"

In [None]:
# copy snipped images to kaggle output
!time bash "$copy_snipped_images_script" "snips" "$images_output_dir" "$num_classes"

In [None]:
# count the entries in each class directory (class_1 .. class_5) under train
!ls "$images_output_dir/train/class_1" | wc -l
!ls "$images_output_dir/train/class_2" | wc -l
!ls "$images_output_dir/train/class_3" | wc -l
!ls "$images_output_dir/train/class_4" | wc -l
!ls "$images_output_dir/train/class_5" | wc -l

In [None]:
# count the entries in each class directory (class_1 .. class_5) under test
!ls "$images_output_dir/test/class_1" | wc -l
!ls "$images_output_dir/test/class_2" | wc -l
!ls "$images_output_dir/test/class_3" | wc -l
!ls "$images_output_dir/test/class_4" | wc -l
!ls "$images_output_dir/test/class_5" | wc -l

In [None]:
# count the entries in test2/class_1 (the only test2 directory checked here)
!ls "$images_output_dir/test2/class_1" | wc -l

In [None]:
# remove everything from kaggle output that is not needed anymore:
# deletes the cloned repository directory (project_dir) recursively
!rm -rf "$project_dir"