In [1]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from scipy.io import loadmat
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
%matplotlib inline

In [2]:
# Download the original SVHN dataset as compressed .tar.gz archives.
# Each archive contains varying-resolution .png images, each labelled with a multi-digit number sequence.
# Each archive also contains a digitStruct.mat file which stores, per image, the image name and the
# position, size, and label of the bounding box of every digit.

# Base URL; maybe_download() appends the archive file name to it.
url = 'http://ufldl.stanford.edu/housenumbers/'
# Last percentage seen by download_progress_hook(); None until the hook runs for the first time.
last_percent_reported = None

def download_progress_hook(count, blockSize, totalSize):
    """Report download progress on stdout; intended as a ``urlretrieve`` reporthook.

    Writes the percentage whenever it changes to a multiple of 5, and a single
    "." for every other change, so slow connections still show activity.

    Args:
        count: number of blocks transferred so far.
        blockSize: size of one block in bytes.
        totalSize: total size of the file in bytes. ``urlretrieve`` may pass a
            non-positive value when the server does not report a size; no
            progress is printed in that case.
    """
    global last_percent_reported

    if totalSize <= 0:
        # Size unknown: a percentage cannot be computed (and 0 would divide by zero).
        return

    # The last block usually overshoots the real file size, so cap the value at 100.
    percent = min(int(count * blockSize * 100 / totalSize), 100)

    if last_percent_reported != percent:
        if percent % 5 == 0:
            sys.stdout.write("%s%%" % percent)
        else:
            sys.stdout.write(".")
        sys.stdout.flush()
        last_percent_reported = percent

def maybe_download(filename, force=False):
    """Download ``filename`` from the SVHN site unless it is already on disk.

    Args:
        filename: archive name; appended to the module-level ``url`` to build
            the download address and also used as the local path.
        force: when True, download even if the file already exists.

    Returns:
        The local file name, whether the file was just downloaded or was
        already present.
    """
    if force or not os.path.exists(filename):
        print('Attempting to download:', filename)
        filename, _ = urlretrieve(url + filename, filename, reporthook=download_progress_hook)
        print('\nDownload Complete!')
    else:
        print(filename + ' has already been downloaded!')
    # Return the name on both paths so callers never receive None.
    return filename

# Fetch the three SVHN archives; a file that already exists locally is not downloaded again.
# NOTE: the extraction cell refers to the archives by literal name rather than through these variables.
train_filename = maybe_download('train.tar.gz')
test_filename = maybe_download('test.tar.gz')
extra_filename = maybe_download('extra.tar.gz')

train.tar.gz has already been downloaded!
test.tar.gz has already been downloaded!
extra.tar.gz has already been downloaded!


In [3]:
# Extract downloaded .tar.gz files
# Seed NumPy's global random generator.
# NOTE(review): the split cell shuffles with the stdlib `random` module, which this call does not seed.
np.random.seed(133)

def maybe_extract(filename, force=False):
    """Unpack a ``.tar.gz`` archive next to itself unless its folder already exists.

    Args:
        filename: path of the archive, e.g. ``'train.tar.gz'``.
        force: when True, extract even if the target folder is already present.

    Returns:
        The archive path with the ``.tar.gz`` suffix removed. This assumes the
        archive unpacks into a single top-level folder of that name (TODO
        confirm for archives other than the SVHN ones).
    """
    root = os.path.splitext(os.path.splitext(filename)[0])[0]  # Remove .tar.gz
    if os.path.isdir(root) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s.' % (root, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        sys.stdout.flush()  # Show the message before the long-running extraction starts.
        # Extract beside the archive so the result lands at `root` even when
        # `filename` contains a directory part ('' means the current directory).
        destination = os.path.dirname(filename) or '.'
        # NOTE(security): extractall() trusts the member paths stored in the archive;
        # only use it on archives from a trusted source.
        with tarfile.open(filename) as tar:  # closed even if extraction fails
            tar.extractall(destination)
    data_folders = root
    print(data_folders)
    return data_folders
  
# Unpack each archive (skipped when its folder already exists) and keep the resulting folder names.
train_folders = maybe_extract('train.tar.gz')
test_folders = maybe_extract('test.tar.gz')
extra_folders = maybe_extract('extra.tar.gz')

train already present - Skipping extraction of train.tar.gz.
train
test already present - Skipping extraction of test.tar.gz.
test
extra already present - Skipping extraction of extra.tar.gz.
extra


In [4]:
# Helper class to extract data from the digitStruct.mat file into Python dictionaries.
# Ref: https://discussions.udacity.com/t/how-to-deal-with-mat-files/160657/5

import h5py

# The DigitStructFile is just a wrapper around the h5py data.  It holds references to:
#    inf:              The input h5 matlab file
#    digitStructName:  The h5 ref to all the file names
#    digitStructBbox:  The h5 ref to all the struct data
class DigitStructFile:
    """Read-only wrapper around an SVHN ``digitStruct.mat`` file opened with h5py.

    Attributes:
        inf: the open ``h5py.File`` handle.
        digitStructName: the ``digitStruct/name`` dataset (one reference per image).
        digitStructBbox: the ``digitStruct/bbox`` dataset (one reference per image).

    The instance can be used as a context manager, or closed explicitly with
    ``close()``, to release the underlying file handle.
    """

    def __init__(self, inf):
        """Open the file at path ``inf`` for reading and look up its two datasets."""
        self.inf = h5py.File(inf, 'r')
        self.digitStructName = self.inf['digitStruct']['name']
        self.digitStructBbox = self.inf['digitStruct']['bbox']

    def close(self):
        """Close the underlying HDF5 file."""
        self.inf.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def getName(self, n):
        """Return the 'name' string of the n(th) digitStruct.

        The name is stored as a column of character codes; each is converted
        with ``chr`` and the results are joined.
        """
        name_ref = self.digitStructName[n][0]
        # `dataset[()]` reads the whole dataset; it replaces the `.value`
        # attribute, which h5py removed in version 3.0.
        return ''.join([chr(c[0]) for c in self.inf[name_ref][()]])

    def bboxHelper(self, attr):
        """Return the values of one bbox attribute as a list, one entry per digit.

        Handles the coding difference between exactly one bbox (the value is
        stored inline) and several bboxes (``attr`` holds one reference per
        digit, each pointing at a dataset containing the value).
        """
        values = attr[()]  # read once instead of once per digit
        if len(attr) > 1:
            return [self.inf[values[j].item()][()][0][0] for j in range(len(attr))]
        return [values[0][0]]

    def getBbox(self, n):
        """Return a dict of per-digit value lists for the n(th) bbox.

        Keys are 'height', 'label', 'left', 'top' and 'width'.
        """
        group = self.inf[self.digitStructBbox[n].item()]
        bbox = {}
        for key in ('height', 'label', 'left', 'top', 'width'):
            bbox[key] = self.bboxHelper(group[key])
        return bbox

    def getDigitStructure(self, n):
        """Return the bbox dict of the n(th) image with its file name added under 'name'."""
        s = self.getBbox(n)
        s['name'] = self.getName(n)
        return s

    def getAllDigitStructure(self):
        """Return the digitStruct of every image in the input file, in file order."""
        return [self.getDigitStructure(i) for i in range(len(self.digitStructName))]

    def getAllDigitStructure_ByDigit(self):
        """Return a restructured version of the dataset (one structure per boxed digit).

        Returns a list of dicts, one per image:
            'filename' : file name of the sample
            'boxes'    : list of dicts, one per digit:
                'label'           : 1 to 9 for the corresponding digits, 10 for digit '0'
                'left', 'top'     : position of the bounding box
                'width', 'height' : dimensions of the bounding box

        Note: this may be turned into a generator if memory issues arise.
        """
        box_keys = ('height', 'label', 'left', 'top', 'width')
        result = []
        for struct in self.getAllDigitStructure():
            boxes = [
                dict((key, struct[key][j]) for key in box_keys)
                for j in range(len(struct['height']))
            ]
            result.append({'filename': struct['name'], 'boxes': boxes})
        return result

In [5]:
# Run the DigitStructFile function for train, test and extra datasets.

def run_DSF(folder_name):
    """Load the bounding-box metadata stored in ``<folder_name>/digitStruct.mat``.

    Args:
        folder_name: folder of an extracted SVHN archive.

    Returns:
        The list produced by ``DigitStructFile.getAllDigitStructure_ByDigit()``:
        one dict per image with its 'filename' and its list of digit 'boxes'.
    """
    path = os.path.join(folder_name, 'digitStruct.mat')
    dsf = DigitStructFile(path)
    try:
        print("Extracting data from %s. This may take a while. Please wait." % path)
        dataset = dsf.getAllDigitStructure_ByDigit()
    finally:
        # The returned structure holds plain scalars and strings already read from
        # the file, so the HDF5 handle can be released as soon as parsing ends.
        dsf.inf.close()
    return dataset


# Parse the bounding-box metadata of each split into a list of per-image dicts.
train_data = run_DSF('train')
test_data = run_DSF('test')
extra_data = run_DSF('extra')

print("Complete!")
# Show the first training record as a quick check of the structure.
print(train_data[0])

Extracting data from train/digitStruct.mat. This may take a while. Please wait.
Extracting data from test/digitStruct.mat. This may take a while. Please wait.
Extracting data from extra/digitStruct.mat. This may take a while. Please wait.
Complete!
{'boxes': [{'width': 81.0, 'top': 77.0, 'label': 1.0, 'left': 246.0, 'height': 219.0}, {'width': 96.0, 'top': 81.0, 'label': 9.0, 'left': 323.0, 'height': 219.0}], 'filename': '1.png'}


In [6]:
# Get data as an array of pixel dimensions for each .png file.
from PIL import Image

def get_img_size(dataset, folder_name):
    """Collect the pixel dimensions of every image listed in ``dataset``.

    Args:
        dataset: sequence of dicts, each with a 'filename' key.
        folder_name: folder that contains the image files.

    Returns:
        A float array with one row per image holding the two values of the
        image's ``size`` attribute (for PIL images: width, then height).
    """
    img_size = np.empty([len(dataset), 2])
    for i, entry in enumerate(dataset):
        filepath = os.path.join(folder_name, entry['filename'])
        # Only the size is needed; the context manager closes the file right away
        # instead of leaving one open handle per image.
        with Image.open(filepath) as imp:
            img_size[i, :] = imp.size
    return img_size

# One row of image dimensions per image, for each of the three splits.
train_imsize = get_img_size(train_data, 'train')
test_imsize = get_img_size(test_data, 'test')
extra_imsize = get_img_size(extra_data, 'extra')

In [7]:
# For each split, print the largest and then the smallest horizontal and vertical pixel length.
for split_sizes in (train_imsize, test_imsize, extra_imsize):
    horizontal, vertical = split_sizes[:, 0], split_sizes[:, 1]
    print(np.amax(horizontal), np.amax(vertical))
    print(np.amin(horizontal), np.amin(vertical))

(876.0, 501.0)
(25.0, 12.0)
(1083.0, 516.0)
(31.0, 13.0)
(668.0, 415.0)
(22.0, 13.0)


In [8]:
# Since the dataset comes in varying image sizes, we will crop it into 32x32 images 
# with care taken to include all digits in the bounding boxes.
import PIL.Image as Image

def _digit_crop_box(boxes, img_size):
    """Return the (left, top, right, bottom) crop rectangle enclosing all digit boxes.

    The union of the boxes is grown by 10% of its width/height on every side
    and then clipped to the image bounds given by ``img_size`` (width, height).
    """
    top = np.array([box['top'] for box in boxes], dtype='float64')
    left = np.array([box['left'] for box in boxes], dtype='float64')
    height = np.array([box['height'] for box in boxes], dtype='float64')
    width = np.array([box['width'] for box in boxes], dtype='float64')

    union_top = np.amin(top)
    union_left = np.amin(left)
    # The union must reach the lowest bottom edge / right-most right edge of ANY
    # box, which is not necessarily the box with the largest top / left value.
    union_height = np.amax(top + height) - union_top
    union_width = np.amax(left + width) - union_left

    crop_top = np.floor(union_top - 0.1 * union_height)
    crop_left = np.floor(union_left - 0.1 * union_width)
    crop_bottom = min(np.ceil(crop_top + 1.2 * union_height), img_size[1])
    crop_right = min(np.ceil(crop_left + 1.2 * union_width), img_size[0])

    # Clip the top/left edges as well; a negative coordinate would make the crop
    # include padding from outside the picture.
    return (int(max(crop_left, 0)), int(max(crop_top, 0)), int(crop_right), int(crop_bottom))


def _to_normalized_gray(rgb_image):
    """Convert an RGB image to a zero-mean, unit-variance grayscale array of shape (H, W, 1)."""
    gray = np.dot(np.array(rgb_image, dtype='float32'), [[0.2989], [0.5870], [0.1140]])
    mean = np.mean(gray, dtype='float32')
    std = np.std(gray, dtype='float32')
    if std < 1e-4:
        std = 1  # (Nearly) constant image: avoid dividing by ~0.
    return (gray - mean) / std


def generate_dataset(data, folder):
    """Build fixed-size image and label arrays from a list of digitStruct records.

    Since the dataset comes in varying image sizes, every image is cropped to
    the region containing all of its digit boxes, resized to 32x32, converted
    to grayscale and normalized.

    Args:
        data: list of dicts with a 'filename' and a list of 'boxes'; every box
            has 'label', 'left', 'top', 'width' and 'height'.
        folder: folder that contains the image files.

    Returns:
        (dataset, labels):
            dataset: float32 array of shape (len(data), 32, 32, 1).
            labels: int32 array of shape (len(data), 6). Column 0 is the
                number of digits in the image; columns 1-5 are the digits,
                with file label 10 stored as digit 0 and unused positions
                left at 10 (blank). Digits beyond the fifth are not stored,
                and a message is printed for each of them.
    """
    print("Generating dataset from %s. This may take a while. Please wait." % folder)
    dataset = np.empty([len(data), 32, 32, 1], dtype='float32')
    labels = np.ones([len(data), 6], dtype='int32') * 10  # Encode blank digits as 10

    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter under
    # its current name. Fall back to the old name on very old Pillow versions.
    resample = Image.LANCZOS if hasattr(Image, 'LANCZOS') else Image.ANTIALIAS

    for i, sample in enumerate(data):
        boxes = sample['boxes']
        labels[i, 0] = len(boxes)  # Encode index 0 of labels as the number of digits in the sequence.

        for j, box in enumerate(boxes):
            if j < 5:
                # Encode label 10 as digit 0.
                labels[i, j + 1] = 0 if box['label'] == 10 else box['label']
            else:
                print('#', i, 'image has more than 5 digits.')

        filepath = os.path.join(folder, sample['filename'])
        with Image.open(filepath) as imp:  # closed as soon as the pixels are read
            crop_box = _digit_crop_box(boxes, imp.size)
            # convert('RGB') guarantees the three channels the grayscale weights expect.
            im = imp.convert('RGB').crop(crop_box).resize((32, 32), resample)
        dataset[i, :, :, :] = _to_normalized_gray(im)

    return dataset, labels

# Build the image and label arrays for each split and print their shapes.
train_dataset, train_labels = generate_dataset(train_data, 'train')
print(train_dataset.shape, train_labels.shape)

test_dataset, test_labels = generate_dataset(test_data, 'test')
print(test_dataset.shape, test_labels.shape)

extra_dataset, extra_labels = generate_dataset(extra_data, 'extra')
print(extra_dataset.shape, extra_labels.shape)

Generating dataset from train. This may take a while. Please wait.
('#', 29929, 'image has more than 5 digits.')
((33402, 32, 32, 1), (33402, 6))
Generating dataset from test. This may take a while. Please wait.
((13068, 32, 32, 1), (13068, 6))
Generating dataset from extra. This may take a while. Please wait.
((202353, 32, 32, 1), (202353, 6))


In [9]:
# Drop training images that contain more than 5 digits (image 29929 in the run above).
# These are treated as outliers for our model.
# Column 0 of the labels holds the digit count, so filter on it instead of deleting a
# hard-coded row index: re-running this cell then removes nothing further.
keep_rows = train_labels[:, 0] <= 5
train_dataset = train_dataset[keep_rows]
train_labels = train_labels[keep_rows]

In [10]:
# Split dataset into training, validation and test set.
# Ref: https://arxiv.org/pdf/1204.3968.pdf
import random

# Seed the stdlib generator used by random.shuffle below so that the split is
# reproducible from run to run (an argument-less random.seed() uses system entropy).
random.seed(133)

num_labels = 10
valid_ind = []
valid_ind_2 = []
train_ind = []
train_ind_2 = []

# For every possible first digit, reserve the first 400 'train' images and the first
# 200 'extra' images for validation; all remaining images go to the training set.
for i in np.arange(num_labels):
    train_rows = np.where(train_labels[:,1] == (i))[0]
    extra_rows = np.where(extra_labels[:,1] == (i))[0]
    valid_ind.extend(train_rows[:400].tolist())
    train_ind.extend(train_rows[400:].tolist())
    valid_ind_2.extend(extra_rows[:200].tolist())
    train_ind_2.extend(extra_rows[200:].tolist())

random.shuffle(valid_ind)
random.shuffle(train_ind)
random.shuffle(valid_ind_2)
random.shuffle(train_ind_2)

# NOTE: train_dataset / train_labels are rebound below, so this cell must not be run
# twice without re-running the cells that create them.
valid_dataset = np.concatenate((extra_dataset[valid_ind_2,:,:,:], train_dataset[valid_ind,:,:,:]), axis=0)
valid_labels = np.concatenate((extra_labels[valid_ind_2,:], train_labels[valid_ind,:]), axis=0)
train_dataset = np.concatenate((extra_dataset[train_ind_2,:,:,:], train_dataset[train_ind,:,:,:]), axis=0)
train_labels = np.concatenate((extra_labels[train_ind_2,:], train_labels[train_ind,:]), axis=0)

print('Training set dimensions: ', train_dataset.shape, train_labels.shape)
print('Validation set dimensions: ', valid_dataset.shape, valid_labels.shape)
print('Testing set dimensions: ', test_dataset.shape, test_labels.shape)

('Training set dimensions: ', (230070, 32, 32, 1), (230070, 6))
('Validation set dimensions: ', (5684, 32, 32, 1), (5684, 6))
('Testing set dimensions: ', (13068, 32, 32, 1), (13068, 6))


In [11]:
# Save to pickle file for later reuse.

pickle_file = 'SVHN_multi_2.pickle'

try:
    save = {
    'train_dataset': train_dataset,
    'train_labels': train_labels,
    'test_dataset': test_dataset,
    'test_labels': test_labels,
    'valid_dataset': valid_dataset,
    'valid_labels': valid_labels,
    }
    # The with-block closes the file even when pickle.dump raises part-way through.
    with open(pickle_file, 'wb') as f:
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise

# Size in bytes of the file just written (the pickle is written without compression).
statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)

('Compressed pickle size:', 1025147096)
