# Dogs vs. Cats Redux: Kernels Edition

* the Kaggle data is available at
https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/data
and has been uploaded to Amazon S3 for the sake of reproducibility

## Create the necessary directory structure

In [None]:
import os

In [None]:
# Build the train/validation directory tree.
# Every leaf directory is checked on its own, so a 'data' folder that already
# exists but is incomplete gets completed instead of being skipped entirely.
data_dirs = (
    'data/train/dogs',
    'data/train/cats',
    'data/validation/dogs',
    'data/validation/cats',
)
missing_dirs = [data_dir for data_dir in data_dirs if not os.path.isdir(data_dir)]
if missing_dirs:
    print('Creating directory tree..')
    for data_dir in missing_dirs:
        # exist_ok tolerates the directory appearing between check and creation
        os.makedirs(data_dir, exist_ok=True)

* download the training dataset from the Amazon bucket

In [None]:
# Local path of the training data archive (relative to the working directory).
train_zip_file = 'data/train.zip'

In [None]:
# Download the training archive unless it is already present locally.
if not os.path.isfile(train_zip_file):
    print('Downloading training data archive. Please wait.')
    import requests

    # Download into a sibling '.part' file first, so that an interrupted
    # transfer never leaves a truncated archive under the final name
    # (which the isfile() guard above would accept on the next run).
    partial_file = train_zip_file + '.part'

    # get stream; the context manager releases the connection afterwards
    with requests.get('https://s3.eu-west-2.amazonaws.com/kaggledatafiles/train.zip',
                      allow_redirects=False, stream=True) as r:
        # raise on 4xx/5xx instead of saving the error body as the archive
        r.raise_for_status()

        # write to file; larger chunks mean fewer write calls
        with open(partial_file, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=65536):
                fd.write(chunk)

    # publish the completed download under its final name
    os.replace(partial_file, train_zip_file)

* validate checksum

In [None]:
import hashlib

In [None]:
def sha256_checksum(filename, block_size=65536):
    """Return the hexadecimal SHA-256 digest of the file at *filename*.

    The file is opened in binary mode and consumed in pieces of at most
    *block_size* bytes, so it is never loaded into memory as a whole.
    """
    digest = hashlib.sha256()
    with open(filename, 'rb') as stream:
        while True:
            piece = stream.read(block_size)
            if piece == b'':
                # end of file reached
                break
            digest.update(piece)
    return digest.hexdigest()

In [None]:
# Verify the archive against its expected SHA-256 digest before using it.
correct_checksum = '36fbbfb947aeffeaf6f8ddf5178cd43d1b74aae150dff942eadb07df7929a6d7'
checksum = sha256_checksum(train_zip_file)
# Explicit raise instead of an assert statement: asserts are stripped when
# Python runs with -O, which would silently disable this integrity check.
# The exception type and message are the same as the assert produced.
if checksum != correct_checksum:
    raise AssertionError('Checksum mismatch!')

* unzip the downloaded archive

In [None]:
# Extract the archive unless 'data/raw' already exists.
if not os.path.exists('data/raw'):
    print('Unzipping training data archive..')
    import shutil
    import zipfile

    # 'data/raw' doubles as the "already extracted" marker checked above, so
    # it must only come into existence once extraction has fully succeeded.
    # Extract into a scratch directory and rename it at the very end.
    scratch_dir = 'data/raw.partial'
    # remove leftovers of a previously aborted extraction, if any
    shutil.rmtree(scratch_dir, ignore_errors=True)
    with zipfile.ZipFile(train_zip_file, 'r') as zip_ref:
        zip_ref.extractall(scratch_dir)
    os.replace(scratch_dir, 'data/raw')

## Class definitions for easy manipulation of image files

In [None]:
from collections.abc import Mapping, Sequence

In [None]:
class Pictures(Mapping):
    """Read-only mapping from an animal name to the sequence of its pictures.

    Possible keys are ``'cat'`` and ``'dog'``. A key is only present when at
    least one of the given file names belongs to that animal; the values are
    ``CatSequence`` / ``DogSequence`` instances.
    """

    def __init__(self, filenames):
        """Group *filenames* by the animal encoded in each file name.

        Args:
            filenames: iterable of picture file names.

        Raises:
            NameError: if a file name's animal field is neither ``'cat'``
                nor ``'dog'``.
        """
        self.files = filenames

        # Single pass: bucket the pictures by their animal field.
        by_animal = {}
        for file in filenames:
            pic = Picture(file)
            by_animal.setdefault(pic.animal, []).append(pic)

        sequence_types = {'cat': CatSequence, 'dog': DogSequence}
        if any(animal not in sequence_types for animal in by_animal):
            raise NameError('Unidentified animal!')

        # Keys keep a fixed order ('cat', then 'dog'). An animal without any
        # picture is simply left out; previously that case crashed with an
        # unbound local variable.
        self._storage = {
            animal: sequence_type(by_animal[animal])
            for animal, sequence_type in sequence_types.items()
            if animal in by_animal
        }

    def __getitem__(self, key):
        """Return the picture sequence stored for animal *key*."""
        return self._storage[key]

    def __iter__(self):
        """Iterate over the animal names that have pictures."""
        return iter(self._storage)

    def __len__(self):
        """Return the number of animals that have pictures."""
        return len(self._storage)

    def __repr__(self):
        cname = type(self).__name__
        return "{0}({1})".format(cname, self.files)

    def __str__(self):
        """Return all file names, one per line."""
        return "\n".join(self.files)

In [None]:
class AnimalSequence(Sequence):
    """Pictures of a single animal, addressed by their picture index.

    Integer keys are the pictures' own ``index`` values, not positions, so
    negative indices are not supported. Iteration yields the pictures in
    ascending index order. The base class is defined before its subclasses
    so that the definitions work when executed from top to bottom.
    """

    def __init__(self, list_of_pictures):
        """Store *list_of_pictures* keyed by each picture's ``index``.

        Raises:
            AssertionError: if the pictures belong to more than one animal.
        """
        self._storage = {pic.index: pic for pic in list_of_pictures}

        self._animal = {pic.animal for pic in self._storage.values()}

        # A sequence must not mix animals. An empty sequence is accepted so
        # that an empty slice yields an empty sequence instead of failing.
        assert len(self._animal) <= 1, 'Error: multi-animal sequence!'

    @property
    def files(self):
        """Generator over the file names of the stored pictures."""
        return (pic.file for pic in self._storage.values())

    def __len__(self):
        return len(self._storage)

    def __iter__(self):
        """Yield the pictures in ascending index order."""
        for index in sorted(self._storage):
            yield self._storage[index]

    def __getitem__(self, key):
        """Return one picture (int key) or a sub-sequence (slice key).

        A slice is resolved against ``len(self)`` and returns a new
        instance of the same class; it raises ``KeyError`` if an index
        inside the resolved range is not stored.

        Raises:
            IndexError: if an integer *key* is not a stored picture index.
        """
        if isinstance(key, slice):
            indices = range(*key.indices(len(self)))
            return self.__class__([self._storage[i] for i in indices])
        # plain index
        try:
            return self._storage[key]
        except KeyError:
            # The Sequence mixins (e.g. ``in``) rely on IndexError,
            # not KeyError, to detect a missing item.
            raise IndexError('picture index out of range: {0!r}'.format(key)) from None

    def __repr__(self):
        cname = type(self).__name__
        pic_list = list(self._storage.values())
        return "{0}({1})".format(cname, pic_list)

    def __str__(self):
        """Return the stored file names, one per line."""
        return "\n".join([pic.file for pic in self._storage.values()])


class CatSequence(AnimalSequence):
    """An ``AnimalSequence`` that only accepts cat pictures."""

    def __init__(self, list_of_pictures):
        AnimalSequence.__init__(self, list_of_pictures)
        # Subset test instead of pop(): the check no longer empties
        # self._animal as a side effect of the assertion.
        assert self._animal <= {'cat'}, 'Expecting cats!'


class DogSequence(AnimalSequence):
    """An ``AnimalSequence`` that only accepts dog pictures."""

    def __init__(self, list_of_pictures):
        AnimalSequence.__init__(self, list_of_pictures)
        # Subset test instead of pop(): the check no longer empties
        # self._animal as a side effect of the assertion.
        assert self._animal <= {'dog'}, 'Expecting dogs!'

In [None]:
class Picture:
    """A picture file whose name is read as ``<animal>.<index>.<extension>``."""

    # character that separates the fields of a file name
    separator = '.'

    def __init__(self, file):
        self.file = file

    def _fields(self):
        """Return the file name split at every separator."""
        name_parts = self.file.split(Picture.separator)
        return name_parts

    def _field(self, position):
        """Return the file-name field at *position*."""
        return self._fields()[position]

    @property
    def animal(self):
        """First field of the file name."""
        return self._field(0)

    @property
    def index(self):
        """Second field of the file name, converted to ``int``."""
        raw_index = self._field(1)
        return int(raw_index)

    @property
    def extension(self):
        """Third field of the file name."""
        return self._field(2)

    def __repr__(self):
        return '%s(%s)' % (type(self).__name__, repr(self.file))

    def __str__(self):
        return self.file

## Fill folders with the right files

In [None]:
# Names of all entries in the extracted training folder (os.listdir returns
# them in arbitrary order).
file_list = os.listdir('data/raw/train')

In [None]:
# Group the file names by animal; pics['cat'] / pics['dog'] give the sequences.
pics = Pictures(file_list)

In [None]:
# get the cats indexed 0-999 (the slice end, 1000, is exclusive)
cats_0_1000 = pics['cat'][0:1000]

In [None]:
# materialise the generator to display the selected file names
list(cats_0_1000.files)

In [None]:
# get the cats indexed 1000-1400 (the slice end, 1401, is exclusive)
cats_1000_1400 = pics['cat'][1000:1401]

In [None]:
# get the list of dogs indexed 12500-13499 (?)


In [None]:
# get the list of dogs indexed 13500-13900 (?)
