## Download and Prepare Images

In [None]:
import pathlib
import string
import warnings
import random
import shutil

from PIL import Image

In [None]:
# download data
#
# Fetches the kagglecatsanddogs archive, clears the output of any previous run,
# extracts the archive into ./raw_data and finally deletes the zip file.

!wget https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip
# remove stale raw data and earlier train/val/test splits so a re-run starts clean
!rm -rf raw_data ../data/train ../data/val ../data/test
# unzip's per-file listing is sent to /dev/null to keep the cell output short
!unzip "kagglecatsanddogs_3367a.zip" -d raw_data >/dev/null
# the archive is no longer needed once it has been extracted
!rm -rf kagglecatsanddogs_3367a.zip

In [None]:
# directory setup
#
# data_dir:     "data" folder that sits next to the current working directory
#               (created here if it does not exist yet)
# original_dir: "raw_data/PetImages" inside the current working directory

working_dir = pathlib.Path.cwd()
original_dir = working_dir.joinpath('raw_data', 'PetImages')
data_dir = working_dir.parent.joinpath('data')
data_dir.mkdir(exist_ok=True)

In [None]:
# remove invalid images because they break the image data generators
#
# Every *.jpg in the Cat and Dog folders of original_dir is opened with
# Pillow; a file whose open raises OSError is deleted from disk. The number
# of remaining images per class is printed afterwards.

with warnings.catch_warnings():
    # silence warnings whose message mentions EXIF while scanning the files
    warnings.filterwarnings("ignore", message=".*EXIF")
    for animal in ['cat', 'dog']:
        animal_dir = original_dir / string.capwords(animal)
        # take a snapshot of the listing first: files are deleted inside the
        # loop, and the directory should not change under a live iterator
        imgs = list(animal_dir.glob('*.jpg'))
        for img in imgs:
            try:
                # the context manager closes the file handle right away
                # instead of leaving it to the garbage collector
                with Image.open(img):
                    pass
            except OSError:
                img.unlink()
        print(f"valid {animal} images", len(list(animal_dir.glob('*.jpg'))))

In [None]:
# copy files to train/val/test folders
#
# For each class the image list is shuffled with a fixed seed and then cut
# into three consecutive slices: the first 10000 files go to train, the next
# 1500 to val and everything after index 11500 to test. Files are copied
# (not moved) into data_dir/<dataset>/<animal>/.

imgs = dict()
for animal in ['cat', 'dog']:
    # glob yields entries in a filesystem-dependent order; sorting first makes
    # the seeded shuffle produce the same split on every machine
    imgs[animal] = sorted((original_dir / string.capwords(animal)).glob('*.jpg'))
    # re-seed per class so each class's shuffle does not depend on the other
    random.seed(42)
    random.shuffle(imgs[animal])

# index range of the shuffled list that belongs to each dataset
splits = {
    'train': slice(None, 10000),
    'val': slice(10000, 11500),
    'test': slice(11500, None),
}

for dataset, split in splits.items():
    for animal in ['cat', 'dog']:
        cur_dir = data_dir / dataset / animal
        # parents=True also creates data_dir/<dataset> when it is missing
        cur_dir.mkdir(parents=True, exist_ok=True)
        files = imgs[animal][split]
        for file in files:
            shutil.copy(file, cur_dir / file.name)
        print(len(list(cur_dir.glob('*.jpg'))), "images in", cur_dir)

In [None]:
# clean up
#
# Delete the extracted raw_data folder; the train/val/test folders hold
# copies of the images, so the extracted originals are no longer needed.

!rm -rf raw_data