In [1]:
# Import necessary tools and set seed

import glob
import numpy as np
import os
import shutil
import pandas as pd
SEED = 42  # fixed seed for NumPy's global RNG, used by the random sampling later on
np.random.seed(SEED)

In [2]:
# Copy the dataset archive (bucket object `dog-vs-cat`) from the Google Cloud
# Storage bucket to the local path /tmp/train.zip. Runs the `gsutil` CLI via
# the IPython shell escape, so `gsutil` must be available on PATH.

! gsutil cp gs://dog-vs-cat-arnaud-test/dog-vs-cat /tmp/train.zip

Copying gs://dog-vs-cat-arnaud-test/dog-vs-cat...
| [1 files][543.2 MiB/543.2 MiB]                                                
Operation completed over 1 objects/543.2 MiB.                                    


In [4]:
import zipfile

In [5]:
# Unpack the downloaded archive into its own directory under /tmp.
local_zip = '/tmp/train.zip'      # local path of the downloaded .zip file
extract_dir = '/tmp/train-unzip'  # directory the archive contents are extracted into

# The context manager closes the archive even if extraction raises.
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

In [6]:
# Collect the path of every extracted picture. glob returns paths in an
# arbitrary, filesystem-dependent order, so sort them: the seeded random
# sampling done later is only repeatable if it starts from the same ordering.
files = sorted(glob.glob('/tmp/train-unzip/train/*'))

# Classify on the file name only, so a 'cat' or 'dog' substring somewhere in
# the directory part of the path can never put a file in the wrong list.
cat_files = [fn for fn in files if 'cat' in os.path.basename(fn)]  # all cat pictures
dog_files = [fn for fn in files if 'dog' in os.path.basename(fn)]  # all dog pictures
len(cat_files), len(dog_files)

(12500, 12500)

In [7]:
def _draw_and_remove(pool, size):
    """Randomly pick `size` paths from `pool` without replacement.

    Returns a tuple ``(sample, remaining)``: ``sample`` is the NumPy array
    produced by ``np.random.choice`` and ``remaining`` is the pool minus the
    sample, as a sorted list.

    The sort is what keeps the split repeatable: the iteration order of a set
    of strings may differ between Python processes (string hash
    randomization), so an unsorted remainder would make the next seeded draw
    select different files from run to run.
    """
    sample = np.random.choice(pool, size=size, replace=False)
    remaining = sorted(set(pool) - set(sample))
    return sample, remaining


# NOTE: cat_files / dog_files are rebound to the shrinking pool, so re-running
# this cell without first re-running the cell that builds them samples from an
# already reduced pool.

# 1500 random images per class for training, removed from the pool.
cat_train, cat_files = _draw_and_remove(cat_files, 1500)
dog_train, dog_files = _draw_and_remove(dog_files, 1500)

# 500 random images per class for validation, removed from the pool.
cat_val, cat_files = _draw_and_remove(cat_files, 500)
dog_val, dog_files = _draw_and_remove(dog_files, 500)

# 500 random images per class for test; this is the last draw, so the pool is
# left as it is afterwards.
cat_test = np.random.choice(cat_files, size=500, replace=False)
dog_test = np.random.choice(dog_files, size=500, replace=False)

print('Cat datasets:', cat_train.shape, cat_val.shape, cat_test.shape)
print('Dog datasets:', dog_train.shape, dog_val.shape, dog_test.shape)

Cat datasets: (1500,) (500,) (500,)
Dog datasets: (1500,) (500,) (500,)


In [8]:
# Merge the cat and dog samples of each split into a single array of paths
# (cats first, then dogs).

train_files, validate_files, test_files = (
    np.concatenate(class_pair)
    for class_pair in (
        (cat_train, dog_train),
        (cat_val, dog_val),
        (cat_test, dog_test),
    )
)

In [9]:
# Copy the training, validation and test files into one directory per split,
# created relative to the notebook's working directory.

train_dir = 'training_data'
val_dir = 'validation_data'
test_dir = 'test_data'

# Pair every split with its destination so the same steps run for all three.
split_targets = (
    (train_files, train_dir),
    (validate_files, val_dir),
    (test_files, test_dir),
)

# exist_ok=True lets the cell be re-run when the directories already exist.
for _, target_dir in split_targets:
    os.makedirs(target_dir, exist_ok=True)

# NOTE(review): existing files in these directories are never removed, so
# images copied by an earlier run with a different selection stay in place.
# Clear the directories first if the split has changed.
for split_files, target_dir in split_targets:
    for fn in split_files:
        shutil.copy(fn, target_dir)