### Download and extract data

In [1]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import Image
from scipy import ndimage
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
%matplotlib inline

In [2]:
url = 'http://ufldl.stanford.edu/housenumbers/'

def maybe_download(filename, force=False, expected_bytes=None):
    """Download `filename` from `url` unless it is already present locally.

    Args:
        filename: Name of the file below `url`; also used as the local path.
        force: When True, download even if the local file already exists.
        expected_bytes: Optional size in bytes the local file must have.
            When omitted (the default) no size check is performed.

    Returns:
        The local file name.

    Raises:
        ValueError: If `expected_bytes` is given and the file size differs.
    """
    if force or not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)
    if expected_bytes is not None:
        # Only verify when the caller actually knows the expected size.
        actual_bytes = os.stat(filename).st_size
        if actual_bytes != expected_bytes:
            raise ValueError(
                'Failed to verify %s: expected %d bytes, found %d'
                % (filename, expected_bytes, actual_bytes))
    print('Found and verified', filename)
    return filename

# Fetch the three archives; maybe_download skips any file that already exists locally.
train_filename = maybe_download('train.tar.gz')
test_filename = maybe_download('test.tar.gz')
extra_filename = maybe_download('extra.tar.gz')

Found and verified train.tar.gz
Found and verified test.tar.gz
Found and verified extra.tar.gz


In [3]:
np.random.seed(133)

def maybe_extract(filename, force=False):
    """Extract a `.tar.gz` archive next to itself unless already extracted.

    Args:
        filename: Path of the archive; the target folder is the same path
            with the two extensions (`.tar.gz`) stripped.
        force: When True, extract even if the target folder already exists.

    Returns:
        The path of the folder the archive is expected to unpack into.
    """
    root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz

    if os.path.isdir(root) and not force:
        # You may override by setting force=True.
        print('%s is already presented - Skipping extraction of %s.' % (root, filename))
    else:
        print('Extracting %s file data. This may take a while. Please wait.' % root)
        # Flush before the slow part so the message shows up immediately.
        sys.stdout.flush()
        # Unpack beside the archive so the result lands in `root` even when
        # `filename` contains a directory component.
        target_dir = os.path.dirname(root) or os.curdir
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use this on archives from a trusted source.
        with tarfile.open(filename) as tar:
            tar.extractall(target_dir)
        print('File %s is successfully extracted into %s directory.' % (filename, root))

    return root

# Unpack each archive; maybe_extract returns the folder name derived from the archive name.
train_folder = maybe_extract(train_filename)
test_folder = maybe_extract(test_filename)
extra_folder = maybe_extract(extra_filename)

train is already presented - Skipping extraction of train.tar.gz.
test is already presented - Skipping extraction of test.tar.gz.
extra is already presented - Skipping extraction of extra.tar.gz.


### Feature Selection

In [None]:
import h5py

# DigitStructsWrapper is a thin reader around the 'digitStruct' group of an
# h5 MATLAB file. It keeps:
#     file_:    the opened h5py.File
#     names:    the h5 references to the file names (optionally sliced)
#     bboxes:   the h5 references to the bounding-box records (optionally sliced)
class DigitStructsWrapper:
    def __init__(self, file_, start_ = 0, end_ = 0):
        """
        Open `file_` read-only and select the entries to work on.

        When `end_` is greater than 0 only entries [start_:end_] are used,
        otherwise the whole collection is used.
        """
        self.file_ = h5py.File(file_, 'r')
        self.names = self.file_['digitStruct']['name'][start_:end_] if end_ > 0 else self.file_['digitStruct']['name']
        self.bboxes = self.file_['digitStruct']['bbox'][start_:end_] if end_ > 0 else self.file_['digitStruct']['bbox']
        self.collectionSize = len(self.names)
        print("\n%s file structure contain %d entries" % (file_, self.collectionSize))


    def close(self):
        """
        Close the underlying h5 file. The wrapper must not be used afterwards.
        """
        self.file_.close()


    def bboxHelper(self, keys_):
        """
        Method handles the coding difference when there is exactly one bbox or an array of bbox.

        With several boxes every entry of `keys_` is a reference that has to be
        dereferenced through the file; with a single box the value is stored inline.
        """
        # `dataset[()]` reads the whole dataset; it replaces the `.value`
        # attribute, which is no longer available in h5py 3.
        if len(keys_) > 1:
            refs = keys_[()]
            val = [self.file_[refs[j].item()][()][0][0] for j in range(len(refs))]
        else:
            val = [keys_[()][0][0]]
        return val


    # getBbox returns a dict of data for the n(th) bbox.
    def getBbox(self, n):
        """
        Return {'height', 'left', 'top', 'width', 'label'}, each a list with
        one value per digit of the n(th) entry.
        """
        bb = self.bboxes[n].item()
        record = self.file_[bb]
        return {
            field: self.bboxHelper(record[field])
            for field in ('height', 'left', 'top', 'width', 'label')
        }


    def getName(self, n):
        """
        Method returns the filename for the n(th) digitStruct. Each character is stored
        in the structure as its numeric character code, so it is converted back with chr.
        """
        codes = self.file_[self.names[n][0]][()]
        return ''.join([chr(int(c[0])) for c in codes])


    def getNumberStructure(self,n):
        """
        Return the bbox dict of the n(th) entry with its file name added under 'name'.
        """
        s = self.getBbox(n)
        s['name']=self.getName(n)
        return s

    def getAllNumbersStructure(self):
        """
        Method returns an array, which contains information about every image.
        This info contains: positions, labels
        """
        return [self.getNumberStructure(i) for i in range(self.collectionSize)]


    # Return a restructured version of the dataset (one object per digit in 'boxes').
    #
    #   Return a list of dicts :
    #      'name' : filename of the sample
    #      'boxes' : list of dicts (one by digit) :
    #          'label' : 1 to 9 corresponding digits. 10 for digit '0' in image.
    #          'left', 'top' : position of bounding box
    #          'width', 'height' : dimension of bounding box
    #
    # Note: We may turn this to a generator, if memory issues arise.
    def getAllNumbersRestructured(self): # getAllDigitStructure_ByDigit
        numbersData = self.getAllNumbersStructure()

        result = []
        for numData in numbersData:
            # Turn the per-field lists into one dict per digit.
            metadatas = [
                {
                    'height': numData['height'][i],
                    'label':  numData['label'][i],
                    'left':   numData['left'][i],
                    'top':    numData['top'][i],
                    'width':  numData['width'][i],
                }
                for i in range(len(numData['height']))
            ]
            result.append({ 'boxes':metadatas, 'name':numData["name"] })

        print("Dataset size:", len(result))
        if result:
            # Only show a sample entry when there is one.
            print("Object structure: ", result[0])

        return result

In [None]:
# Read the per-image digit metadata stored alongside the extracted training images.
file_ = os.path.join(train_folder, 'digitStruct.mat')
dsf = DigitStructsWrapper(file_)
train_data = dsf.getAllNumbersRestructured()


train/digitStruct.mat file structure contain 33402 entries


In [None]:
# Same as for the training set: one dict per image with its name and digit boxes.
file_ = os.path.join(test_folder, 'digitStruct.mat')
dsf = DigitStructsWrapper(file_)
test_data = dsf.getAllNumbersRestructured()

In [None]:
# Same as for the training set, for the images of the extra archive.
file_ = os.path.join(extra_folder, 'digitStruct.mat')
dsf = DigitStructsWrapper(file_)
extra_data = dsf.getAllNumbersRestructured()

In [None]:
def stat(data):
    """Count samples by number of boxes.

    Each element of `data` must provide a "boxes" sequence. The result maps
    a box count to the number of samples that have exactly that many boxes.
    """
    histogram = {}
    for sample in data:
        n_boxes = len(sample["boxes"])
        if n_boxes in histogram:
            histogram[n_boxes] += 1
        else:
            histogram[n_boxes] = 1
    return histogram

In [None]:
# For each split: mapping from number of digit boxes per image to number of images.
train_stat = stat(train_data)
test_stat = stat(test_data)
extra_stat = stat(extra_data)
print(train_stat)
print(test_stat)
print(extra_stat)

In [None]:
import matplotlib.pyplot as plt

# Create the figure once and adjust that same figure. Previously the spacing
# was applied to a separate, empty figure that was then left behind.
fig = plt.figure(figsize=(12,3.5))
fig.subplots_adjust(hspace=1)

# One bar chart per split: x = number of digit boxes, y = number of images.
for panel, (panel_title, panel_stat) in enumerate(
        [('Train', train_stat), ('Test', test_stat), ('Extra', extra_stat)]):
    plt.subplot(1, 3, panel + 1)
    # Sort the keys so bars appear in increasing order, and pass real lists
    # (dict views are not sequences on Python 3).
    box_counts = sorted(panel_stat)
    plt.bar(range(len(box_counts)), [panel_stat[k] for k in box_counts], align='center')
    plt.xticks(range(len(box_counts)), box_counts)
    plt.title(panel_title)
    plt.xlabel('Labels')
    if panel == 0:
        plt.ylabel('Occurencies')

plt.show()

Remove the samples whose house number has more than 4 digits.

In [None]:
# Keep only samples with fewer than 5 digit boxes. Build real lists: on
# Python 3 `filter` returns a one-shot iterator, which would be exhausted by
# the first `stat` call and has no `len()` for the cells below.
train_data = [sample for sample in train_data if len(sample["boxes"]) < 5]
test_data = [sample for sample in test_data if len(sample["boxes"]) < 5]
extra_data = [sample for sample in extra_data if len(sample["boxes"]) < 5]
train_stat = stat(train_data)
test_stat = stat(test_data)
extra_stat = stat(extra_data)
# Print the statistics computed above instead of recomputing them.
print(train_stat)
print(test_stat)
print(extra_stat)

In [None]:
import matplotlib.pyplot as plt

# Create the figure once and adjust that same figure. Previously the spacing
# was applied to a separate, empty figure that was then left behind.
fig = plt.figure(figsize=(12,3.5))
fig.subplots_adjust(hspace=1)

# One bar chart per split: x = number of digit boxes, y = number of images.
for panel, (panel_title, panel_stat) in enumerate(
        [('Train', train_stat), ('Test', test_stat), ('Extra', extra_stat)]):
    plt.subplot(1, 3, panel + 1)
    # Sort the keys so bars appear in increasing order, and pass real lists
    # (dict views are not sequences on Python 3).
    box_counts = sorted(panel_stat)
    plt.bar(range(len(box_counts)), [panel_stat[k] for k in box_counts], align='center')
    plt.xticks(range(len(box_counts)), box_counts)
    plt.title(panel_title)
    plt.xlabel('Labels')
    if panel == 0:
        plt.ylabel('Occurencies')

plt.show()

### Image Reformation

In [None]:
from PIL import Image

def print_data_stats(data, folder):
    """Print size statistics of the images referenced by `data`.

    Args:
        data: Indexable collection of dicts; each must have a 'name' entry
            holding an image file name relative to `folder`.
        folder: Directory containing the image files.

    Prints the max / min / mean width and height, followed by the indices
    of the images that attain the max and min values.
    """
    # One row per image: (width, height) as reported by PIL.
    data_imgSize = np.empty([len(data), 2])

    for i in range(len(data)):
        filepath = os.path.join(folder, data[i]['name'])
        # Close each file right after reading its size so handles do not
        # pile up while iterating over the whole data set.
        with Image.open(filepath) as img:
            data_imgSize[i, :] = img.size

    widths, heights = data_imgSize[:,0], data_imgSize[:,1]
    max_w, max_h = np.amax(widths), np.amax(heights)
    min_w, min_h = np.amin(widths), np.amin(heights)
    mean_w, mean_h = np.mean(widths), np.mean(heights)
    print(folder, "max width and height:", max_w, max_h)
    print(folder, "min width and height:", min_w, min_h)
    print(folder, "mean width and height:", mean_w, mean_h, "\n")

    max_w_i, max_h_i = np.where(widths == max_w), np.where(heights == max_h)
    print(folder, "max width indicies:", max_w_i)
    print(folder, "max height indicies:", max_h_i, "\n")


    min_w_i, min_h_i = np.where(widths == min_w), np.where(heights == min_h)
    print(folder, "min width indicies:", min_w_i)
    print(folder, "min height indicies:", min_h_i, "\n***\n")

In [None]:
# Report image size statistics for each split (opens every image file once).
print_data_stats(train_data, train_folder)
print_data_stats(test_data, test_folder)
print_data_stats(extra_data, extra_folder)

In [None]:
img_size = 32

def prepare_images(samples, folder):
    """Crop, resize, grayscale and normalise the images listed in `samples`.

    Args:
        samples: Indexable collection of dicts with a 'name' entry (image file
            name inside `folder`) and a 'boxes' entry (list of dicts with
            'label', 'top', 'left', 'height', 'width').
        folder: Directory containing the image files.

    Returns:
        A tuple (prepared_images, actual_numbers, files):
          prepared_images: float32 array [len(samples), img_size, img_size, 1],
              each image shifted by its mean and divided by its std.
          actual_numbers: int array [len(samples), 5]; column 0 is the number
              of digits, columns 1.. hold the digits, unused slots stay 10.
          files: list of the file names, in the same order.
    """
    print("Started preparing images for convnet...")

    prepared_images = np.ndarray([len(samples),img_size,img_size,1], dtype='float32')
    actual_numbers = np.ones([len(samples),5], dtype=int) * 10
    files = []
    for i in range(len(samples)):
        filename = samples[i]['name']
        filepath = os.path.join(folder, filename)
        boxes = samples[i]['boxes']
        number_length = len(boxes)
        files.append(filename)

        # at 0 index we store length of a label. 3 -> 1; 123 -> 3; 1234 -> 4
        # (only 4 digit slots follow, so longer numbers do not fit)
        actual_numbers[i,0] = number_length

        top = np.ndarray([number_length], dtype='float32')
        left = np.ndarray([number_length], dtype='float32')
        height = np.ndarray([number_length], dtype='float32')
        width = np.ndarray([number_length], dtype='float32')

        for j in range(number_length):
            # here we use j+1 since first entry used by label length
            actual_numbers[i,j+1] = boxes[j]['label']
            if boxes[j]['label'] == 10: # Replacing 10 with 0
                actual_numbers[i,j+1] = 0

            top[j] = boxes[j]['top']
            left[j] = boxes[j]['left']
            height[j] = boxes[j]['height']
            width[j] = boxes[j]['width']

        # Region spanned by the digit boxes: from the smallest top/left to the
        # far edge of the box with the largest top/left.
        img_min_top = np.amin(top)
        img_min_left = np.amin(left)
        img_height = np.amax(top) + height[np.argmax(top)] - img_min_top
        img_width = np.amax(left) + width[np.argmax(left)] - img_min_left

        # Grow the region by 10% on every side.
        # NOTE(review): right/bottom are limited to the image size below, but
        # left/top are not limited to 0 - confirm whether that is intended.
        img_left = np.floor(img_min_left - 0.1 * img_width)
        img_top = np.floor(img_min_top - 0.1 * img_height)

        # Close the file as soon as the pixels have been read.
        with Image.open(filepath) as image:
            img_right = np.amin([np.ceil(img_left + 1.2 * img_width), image.size[0]])
            img_bottom = np.amin([np.ceil(img_top + 1.2 * img_height), image.size[1]])
            # Image.LANCZOS is the same filter as the former Image.ANTIALIAS
            # name, which is no longer available in Pillow 10.
            image = image.crop((img_left, img_top, img_right, img_bottom)).resize((img_size, img_size), Image.LANCZOS) # Resize image to 32x32
        image = np.dot(np.array(image, dtype='float32'), [[0.2989],[0.5870],[0.1140]]) # Convert image to the grayscale

        mean = np.mean(image, dtype='float32')
        std = np.std(image, dtype='float32', ddof=1)
        if std < 0.0001:
            # Avoid dividing by (almost) zero for uniform images.
            std = 1.0
        image = (image - mean) / std
        prepared_images[i,:,:] = image[:,:,:]

    print("Completed. Images cropped, resized and grayscaled")

    return prepared_images, actual_numbers, files

In [None]:
# NOTE: this rebinds `train_data` from the list of sample dicts to the image
# array returned by prepare_images, so this cell cannot simply be run twice.
train_data, train_labels, _ = prepare_images(train_data, train_folder)
print(train_data.shape, train_labels.shape)

In [None]:
# NOTE: this rebinds `test_data` from the list of sample dicts to the image
# array returned by prepare_images; the file names are kept for later use.
test_data, test_labels, test_filenames = prepare_images(test_data, test_folder)
print(test_data.shape, test_labels.shape)

In [None]:
# NOTE: this rebinds `extra_data` from the list of sample dicts to the image
# array returned by prepare_images, so this cell cannot simply be run twice.
extra_data, extra_labels, _ = prepare_images(extra_data, extra_folder)
# Show the image and label shapes (the image shape used to be printed twice).
print(extra_data.shape, extra_labels.shape)

In [None]:
from sklearn.utils import shuffle

# Move the first `extra_to_train` samples of the extra set into the training
# set; the remainder of the extra set is kept separate.
extra_to_train = 40000

train_data_temp = np.concatenate((train_data, extra_data[:extra_to_train]))
# Plain slicing selects the remainder without the full copy np.delete makes.
extra_data_temp = extra_data[extra_to_train:]

train_labels_temp = np.concatenate((train_labels, extra_labels[:extra_to_train]))
extra_labels_temp = extra_labels[extra_to_train:]

# And then we shuffle all the data we have
train_data_temp, train_labels_temp = shuffle(train_data_temp, train_labels_temp)
extra_data_temp, extra_labels_temp = shuffle(extra_data_temp, extra_labels_temp)
test_data_temp, test_labels_temp, test_filenames_temp = shuffle(test_data, test_labels, test_filenames)

print("Train shapes:", train_data_temp.shape, train_labels_temp.shape)
print("Extra shapes:", extra_data_temp.shape, extra_labels_temp.shape)
print("Test shapes:", test_data_temp.shape, test_labels_temp.shape)

In [None]:
pickle_file = 'SVHN2.pickle'

try:
    save = {
        'train_data': train_data_temp,
        'train_labels': train_labels_temp,
        'test_data': test_data_temp,
        'test_labels': test_labels_temp,
        'test_filenames': test_filenames_temp,
        'valid_data': extra_data_temp, # The rest of extra data will be used
        'valid_labels': extra_labels_temp # as validation set during model training
        }
    # The context manager closes the file even when dumping fails.
    with open(pickle_file, 'wb') as f:
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    # Report which file failed, then let the error propagate.
    print('Unable to save data to', pickle_file, ':', e)
    raise

# Size of the written file in bytes.
statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)

### Examining Labels

In [None]:
from collections import Counter

# Column 0 of every label row holds the number of digits (set in prepare_images),
# so each Counter maps a number length to how many samples have that length.
train_num_length = Counter(train_labels_temp[:,0])
test_num_length = Counter(test_labels_temp[:,0])
extra_num_length = Counter(extra_labels_temp[:,0])

In [None]:
import matplotlib.pyplot as plt

# Create the figure once and adjust that same figure. Previously the spacing
# was applied to a separate, empty figure that was then left behind.
fig = plt.figure(figsize=(12,3.5))
fig.subplots_adjust(hspace=1)

# One bar chart per split: x = number length, y = number of samples.
for panel, (panel_title, panel_counter) in enumerate(
        [('Train', train_num_length),
         ('Test', test_num_length),
         ('Validation', extra_num_length)]):
    plt.subplot(1, 3, panel + 1)
    # Each panel takes its ticks from its own counter (the third panel used
    # to take them from the test counter). Real lists are passed because
    # dict views are not sequences on Python 3.
    lengths = sorted(panel_counter)
    plt.bar(lengths, [panel_counter[k] for k in lengths], align='center')
    plt.xticks(lengths)
    plt.title(panel_title)
    plt.xlabel('Labels')
    if panel == 0:
        plt.ylabel('Occurencies')

plt.show()

In [None]:
pickle_file = 'SVHN2.pickle'

# Read back only the label arrays from the pickle.
# NOTE(review): pickle.load executes whatever the file contains; only load
# pickle files that were produced by a trusted source.
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    train_labels = save['train_labels']
    test_labels = save['test_labels']
    valid_labels = save['valid_labels']
    # Drop the reference to the full dict once the labels are extracted.
    del save

In [None]:
from collections import Counter

# Count how often each digit occurs. Column 0 of a label row holds the number
# length, not a digit, so it is skipped; the value 10 marks an unused digit
# slot and is dropped as well.
train_digit_slots = train_labels[:, 1:].flatten()
test_digit_slots = test_labels[:, 1:].flatten()
valid_digit_slots = valid_labels[:, 1:].flatten()

train_digits = Counter(train_digit_slots[train_digit_slots != 10])
test_digits = Counter(test_digit_slots[test_digit_slots != 10])
valid_digits = Counter(valid_digit_slots[valid_digit_slots != 10])

In [None]:
# Create the figure once and adjust that same figure. Previously the spacing
# was applied to a separate, empty figure that was then left behind.
fig = plt.figure(figsize=(12,3.5))
fig.subplots_adjust(hspace=1)

# One bar chart per split: x = digit, y = number of occurrences.
for panel, (panel_title, panel_counter) in enumerate(
        [('Train', train_digits),
         ('Test', test_digits),
         ('Validation', valid_digits)]):
    plt.subplot(1, 3, panel + 1)
    # Pass real, sorted lists: dict views are not sequences on Python 3.
    digits = sorted(panel_counter)
    plt.bar(digits, [panel_counter[k] for k in digits], align='center')
    plt.xticks(digits)
    plt.title(panel_title)
    plt.xlabel('Labels')
    if panel == 0:
        plt.ylabel('Occurencies')

plt.show()