# Prepare Images

Extracts images from the Kaggle dogs-vs-cats dataset.

The train.zip file must be downloaded from:

    https://www.kaggle.com/c/dogs-vs-cats/data
    
And placed under var/download.

A random sample of images will be extracted for each class (cat and dog). The sample is split into training, validation, and test sets, and a directory is created for each set.

In [1]:
import os
import re
import shutil
import pandas as pd
import zipfile
from IPython.display import display, HTML

In [3]:
# Locations used by the rest of the notebook:
#   download_dir - where the Kaggle archive is expected to be placed
#   data_dir     - where the extracted images are written
#   train_zip    - the archive itself
download_dir = os.path.join('var', 'download')
data_dir = os.path.join('var', 'data')
train_zip = os.path.join(download_dir, 'train.zip')

# Start from a clean slate: anything extracted by a previous run is removed
# so that old and new samples never mix.
if os.path.isdir(data_dir):
    shutil.rmtree(data_dir)

# Both directories must exist before the following cells run.
for directory in (data_dir, download_dir):
    os.makedirs(directory, exist_ok=True)

In [4]:
# Make sure the zip file is downloaded
# Nothing below can run without the archive, so show a clickable link to the
# download page and abort the notebook run.
if not os.path.isfile(train_zip):
    url = 'https://www.kaggle.com/c/dogs-vs-cats/data'
    # Take the file name from train_zip so the message always names the file
    # that is actually being checked.
    zip_name = os.path.basename(train_zip)
    display(HTML(f"{zip_name} must be downloaded from: <a href='{url}'>{url}</a>"))
    raise FileNotFoundError("Data not downloaded.")

## Extract Images

In [5]:
# Number of images extracted per class (cat / dog) for each dataset split.
train_size = 1000       # images per class used for training
validation_size = 500   # images per class used for validation
test_size = 500         # images per class used for testing

def read_entries(zf):
    """Index the cat/dog images contained in an opened zip archive.

    Every entry in ``zf.filelist`` whose name contains
    ``train/<cat|dog>.<number>.jpg`` becomes one row; all other entries
    (directories, unrelated files) are ignored.

    Args:
        zf: An object exposing ``filelist``, an iterable of entries that each
            have a ``filename`` attribute (e.g. ``zipfile.ZipFile``).

    Returns:
        A ``pandas.DataFrame`` with the columns ``id`` (the number from the
        file name, as an int), ``label`` (``'cat'`` or ``'dog'``) and
        ``filename`` (the full entry name inside the archive), in archive
        order.
    """
    pattern = re.compile(r'train/(cat|dog)\.(\d+)\.jpg')

    # Pair each entry name with its (possibly missing) regex match.
    candidates = (
        (info.filename, pattern.search(info.filename)) for info in zf.filelist
    )
    rows = [
        (int(found.group(2)), found.group(1), name)
        for name, found in candidates
        if found
    ]
    return pd.DataFrame(rows, columns=['id', 'label', 'filename'])

def extract_entries(zf, label, dataset, items):
    """Copy the selected archive entries into ``data_dir/<dataset>/<label>``.

    The target directory is created if needed. Each file is written under its
    base name, so the ``train/`` prefix used inside the archive is dropped.

    Args:
        zf: Opened zip archive to read the entries from.
        label: Class name, used as the innermost directory name.
        dataset: Dataset split name, used as the parent directory name.
        items: DataFrame with a ``filename`` column holding the archive entry
            names to extract.
    """
    target_dir = os.path.join(data_dir, dataset, label)
    os.makedirs(target_dir, exist_ok=True)

    for entry_name in items['filename']:
        # Read the whole entry first, then write it out in a single call.
        payload = zf.read(entry_name)
        destination = os.path.join(target_dir, os.path.basename(entry_name))
        with open(destination, 'wb') as out_fh:
            out_fh.write(payload)

# For each class, draw one random sample large enough for all three splits
# and cut it into consecutive, non-overlapping train / validation / test
# slices, which are then written to disk.
sample_size = train_size + validation_size + test_size
validation_end = train_size + validation_size

# (dataset name, slice start, slice stop); a stop of None means "to the end".
split_bounds = (
    ('train', 0, train_size),
    ('validation', train_size, validation_end),
    ('test', validation_end, None),
)

with zipfile.ZipFile(train_zip) as zf:
    image_entries = read_entries(zf)
    for label in ['dog', 'cat']:
        of_label = image_entries[image_entries.label == label]
        lbl_data = of_label.sample(sample_size).copy()
        for dataset, start, stop in split_bounds:
            extract_entries(zf, label, dataset, lbl_data[start:stop])