# Snake Species Identification Challenge

**Authors**:
- Angus Mackenzie (1106817)
- Nathan Michlo (1386161)

------------------------

## Imports & Modules

In [52]:
import os
import sys
import urllib
import urllib.request
from pprint import pprint

import dotenv
import pandas as pd
from PIL import Image
from tqdm import tqdm

**Add Local Modules to `sys.path`**
- This assumes that the Jupyter server was launched from the project's root directory, because the vendored module paths below are relative

In [2]:
# SNAPSHOT sys.path ON FIRST CALL, RESET TO THE SNAPSHOT ON EVERY CALL
def restore_sys_path():
    """
    Reset `sys.path` to the value it had the first time this function ran.

    The first call stores a copy of `sys.path` in the module-level
    `_ORIG_SYS_PATH`. Every call (including the first) then rebinds
    `sys.path` to a fresh copy of that snapshot, so later edits to
    `sys.path` never alter the stored original.
    """
    global _ORIG_SYS_PATH
    try:
        snapshot = _ORIG_SYS_PATH
    except NameError:
        # first call: nothing stored yet, so remember the current search path
        snapshot = _ORIG_SYS_PATH = list(sys.path)
    # hand out a copy so the snapshot itself stays untouched
    sys.path = list(snapshot)

# PREPEND TO sys.path
def add_python_path(path):
    """
    Put the absolute form of `path` at the front of `sys.path`.

    The path is inserted at index 0, so it takes precedence over every entry
    already present. Any earlier occurrence of the same absolute path is
    removed first, which keeps `sys.path` free of duplicates when the
    notebook cell is executed more than once.

    Args:
        path: Directory to make importable; relative paths are resolved
            against the current working directory.
    """
    abs_path = os.path.abspath(path)
    # in-place update so that other references to the sys.path list stay valid
    sys.path[:] = [entry for entry in sys.path if entry != abs_path]
    sys.path.insert(0, abs_path)
    
# Vendored third-party code, registered in the order listed below
for vendor_dir in (
    # Methods to visualise CNN activations: https://github.com/utkuozbulak/pytorch-cnn-visualizations
    'vendor/pytorch-cnn-visualizations',
    # Mish activation function: https://github.com/digantamisra98/Mish
    'vendor/Mish',
    # Variance of the Adaptive Learning Rate: https://github.com/LiyuanLucasLiu/RAdam
    'vendor/RAdam',
    # Lookahead optimizer: https://github.com/alphadl/lookahead.pytorch
    'vendor/lookahead.pytorch',
    # Ranger=RAdam+Lookahead: https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer
    'vendor/Ranger-Deep-Learning-Optimizer',
):
    add_python_path(vendor_dir)

------------------------------

## Environment
Load environment variables from a `.env` file, if one is found.

In [3]:
# Locate the .env file once, load it, and report which file was used
dotenv_path = dotenv.find_dotenv()
if dotenv.load_dotenv(dotenv_path):
    print(f'[LOADED]: {dotenv_path}')

[LOADED]: /home/nmichlo/downloads/tmp/snake-id/.env


------------------------

## Loading Data

In [6]:
# DATASET_DIR is normally the only value that needs changing: the class
# mapping, train and test locations default to paths inside it. Each value
# can be overridden by an environment variable of the same name.
DATASET_DIR = os.getenv('DATASET_DIR', 'data')
print(f'[DATASET LOCATION]: {DATASET_DIR}')

DATASET_SSIC_CLASSES = os.getenv(
    'DATASET_SSIC_CLASSES',
    os.path.join(DATASET_DIR, 'class_idx_mapping.csv'),
)
DATASET_SSIC_TRAIN = os.getenv(
    'DATASET_SSIC_TRAIN',
    os.path.join(DATASET_DIR, 'train'),
)
DATASET_SSIC_TEST = os.getenv(
    'DATASET_SSIC_TEST',
    os.path.join(DATASET_DIR, 'round1'),
)
OUTPUT_FOLDER = os.getenv('OUTPUT_FOLDER', 'out')

[DATASET LOCATION]: /home/nmichlo/downloads/datasets/ssic


In [21]:
# Each row of the mapping CSV is unpacked as a (name, class index) pair
# NOTE(review): relies on the CSV having exactly two columns in that order — confirm
class_rows = pd.read_csv(DATASET_SSIC_CLASSES).values
# str -> int
NAME_CLASS_MAP = dict(class_rows)
# int -> str (inverse lookup)
CLASS_NAME_MAP = dict(zip(NAME_CLASS_MAP.values(), NAME_CLASS_MAP.keys()))

In [39]:
def get_train_img_paths(validate=True):
    """
    Collect an `(image_path, class_id)` tuple for every training image.

    Walks `DATASET_SSIC_TRAIN`, which is expected to hold one `class-<id>`
    directory per class with that class's images inside. Entries that are
    not directories named `class-<id>` are skipped. Folders and files are
    visited in sorted order so the result does not depend on the filesystem.

    Args:
        validate: If True, every image is opened with PIL and checked with
            `Image.verify()`; files that fail are collected separately.
            If False, no file is opened and every path is treated as valid.

    Returns:
        If `validate` is True: a tuple `(valid, invalid)` of two lists.
        If `validate` is False: a single list with every path found.
        All list elements are `(image_path, class_id)` tuples, `class_id` being an int.
    """
    cls_prefix = 'class-'
    img_paths_valid, img_paths_invalid = [], []
    # LOOP THROUGH CLASS FOLDERS
    for cls_name in tqdm(sorted(os.listdir(DATASET_SSIC_TRAIN))):
        cls_path = os.path.join(DATASET_SSIC_TRAIN, cls_name)
        cls_suffix = cls_name[len(cls_prefix):]
        # Ignore stray entries (hidden files, archives, ...) that would otherwise
        # break the int() conversion or the nested os.listdir() call
        if not (cls_name.startswith(cls_prefix) and cls_suffix.isdecimal() and os.path.isdir(cls_path)):
            continue
        cls_id = int(cls_suffix)
        # LOOP THROUGH CLASS IMAGES (IN CLASS FOLDER)
        for img_name in sorted(os.listdir(cls_path)):
            img_path = os.path.join(cls_path, img_name)
            data = (img_path, cls_id)
            if validate:
                try:
                    # the context manager closes the file handle, which matters
                    # when tens of thousands of images are opened in one run
                    with Image.open(img_path) as img:
                        img.verify()
                except (IOError, SyntaxError):
                    img_paths_invalid.append(data)
                    continue
            img_paths_valid.append(data)
    if validate:
        return img_paths_valid, img_paths_invalid
    # without validation nothing can be marked invalid, so only one list is returned
    return img_paths_valid

In [51]:
# tuples of (path, class_id)
VALID_IMG_PATHS_CLASS, INVALID_IMG_PATHS_CLASS = get_train_img_paths(validate=True)

print(f'#valid images:   {len(VALID_IMG_PATHS_CLASS)}')
print(f'#invalid images: {len(INVALID_IMG_PATHS_CLASS)}')

# Make sure that all classes appear in data and vice versa
classes_in_data = {class_id for _, class_id in VALID_IMG_PATHS_CLASS}
classes_in_mapping = set(CLASS_NAME_MAP)
assert not (classes_in_data - classes_in_mapping), \
    f'classes in data but missing from the class mapping: {sorted(classes_in_data - classes_in_mapping)}'
assert not (classes_in_mapping - classes_in_data), \
    f'classes in the class mapping but without valid images: {sorted(classes_in_mapping - classes_in_data)}'

print(f'All {len(CLASS_NAME_MAP)} classes are present in data!')

100%|██████████| 45/45 [00:30<00:00,  1.50it/s]

#valid images:   82417
#invalid images: 184
All 45 classes are present in data!





----------------------------------------------------

In [None]:
# source article: https://medium.com/@Stormblessed/2460292bcfb
# annotated data: https://drive.google.com/uc?id=18dx_5Ngmc56fDRZ6YZA_elX-0ehtV5U6



In [None]:
# Download the annotated data into DATASET_DIR.
# Python 3 has no `urllib.URLopener`; `urllib.request.urlretrieve` performs the
# same download and also returns a (local_filename, headers) tuple.
# NOTE(review): the URL is a Google Drive link — confirm that the saved file is
# really the JSON payload and not an HTML interstitial page.
testfile = urllib.request.urlretrieve(
    'https://drive.google.com/uc?id=18dx_5Ngmc56fDRZ6YZA_elX-0ehtV5U6',
    os.path.join(DATASET_DIR, 'annotations.json'),
)