## Dependencies

In [14]:
import os
import urllib
import tarfile
import shutil
import random


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

### Step1:- Download and extract data

* Download both image and annotations tar.gz files
* Extract the content to Data/images and Data/annotations

In [20]:
#Oxford VGG dataset urls for image and annotations
urls = ['http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz',
       'http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz']

def download_and_extract(data_dir, download_dir):
    #Download the data from urls and extract it to datadir
    for url in urls:
        filename = url.split('/')[-1]
        if filename not in os.listdir(download_dir):
            print(f'Downloading {filename}')
            print(os.path.join(download_dir, filename))
            try:
                urllib.request.urlretrieve(url, filename=os.path.join(download_dir, filename))
                tf = tarfile.open(name=os.path.join(download_dir,filename))
                print(tf)
                tf.extractall(data_dir)
            except Exception as e:
                print(e.args)
        else:
            print(f'{filename} is already downloaded')
    
if 'Data' not in os.listdir('../'):
    os.mkdir(os.path.join('..', 'Data'))
             
download_and_extract('../Data', '../')

Downloading images.tar.gz
../images.tar.gz
<tarfile.TarFile object at 0x10b3dec10>
Downloading annotations.tar.gz
../annotations.tar.gz
<tarfile.TarFile object at 0x10b5d4610>


In [21]:
os.listdir('../Data')

['.DS_Store', 'images', 'annotations']

## Step2:- Annotate

* Convert "Abyssinian_100 1 1 1" to "Abyssinian.jpg" and "cat"
* Perform the same for all lines in trainval and test

In [22]:
def annotate(file, annotations={}):
    
    with open(file, 'r') as f:
        rows = f.read().splitlines()
        
    for row in rows:
        image, class_id, species, breed = row.split()
        class_name = '_'.join(image.split('_')[:-1])
        image = image + '.jpg'
        
        annotations[image] = 'cat' if class_name[0] != class_name[0].lower() else 'dog'
        
    return annotations

annotations = annotate('../Data/annotations/trainval.txt')
annotations = annotate('../Data/annotations/test.txt')

In [23]:
len(annotations)

7349

In [24]:
next(iter(annotations.items()))

('Abyssinian_100.jpg', 'cat')

## Step3:- Train Validation Split

* Create new folder stucture under train_test_data and move each image to corresponding folders
* Train Validation split - 80%-20%

In [25]:
new_directory = '../train_test_data'
classes = ['cat', 'dog']
sets = ['train', 'validation']

#Create this if not present
if not os.path.isdir(new_directory):
    os.mkdir(new_directory)
    
for set_name in sets:
    to_create = os.path.join(new_directory, set_name)
    if not os.path.isdir(to_create):
        os.mkdir(os.path.join(new_directory, set_name))
    
    
    for class_name in classes:
        to_create = os.path.join(new_directory, set_name, class_name)
        if not os.path.isdir(to_create):
            os.mkdir(os.path.join(new_directory, set_name, class_name))

In [26]:
for image, class_name  in annotations.items():
    target_set = 'validation' if random.random() <= 0.2 else 'train'
    dest_path = os.path.join(new_directory, target_set, class_name, image)
    _=shutil.copy(os.path.join('../Data/images', image), dest_path)
    

#### Checking the counts

In [29]:
sets_classes_count = {
    'train' : {'dog': 0, 'cat': 0},
    'validation' : {'dog': 0, 'cat': 0}
}

for set_name in sets:
    for class_name in classes:
        path = os.path.join(new_directory, set_name, class_name)
        count = len(os.listdir(path))
        sets_classes_count[set_name][class_name] = count
        print(f"{path} has {count} images")

../train_test_data/train/cat has 1886 images
../train_test_data/train/dog has 3971 images
../train_test_data/validation/cat has 485 images
../train_test_data/validation/dog has 1007 images


In [30]:
sets_classes_count

{'train': {'dog': 3971, 'cat': 1886}, 'validation': {'dog': 1007, 'cat': 485}}