# Get SVHN data

In [422]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle

# Config the matlotlib backend as plotting inline in IPython
%matplotlib inline

In [None]:
###Load SVHN Data files, including mat data, for train and test datasets.

In [423]:
url = 'http://ufldl.stanford.edu/housenumbers/'
last_percent_reported = None

def download_progress_hook(count, blockSize, totalSize):
  """A hook to report the progress of a download. This is mostly intended for users with
  slow internet connections. Reports every 1% change in download progress.
  """
  global last_percent_reported
  percent = int(count * blockSize * 100 / totalSize)

  if last_percent_reported != percent:
    if percent % 5 == 0:
      sys.stdout.write("%s%%" % percent)
      sys.stdout.flush()
    else:
      sys.stdout.write(".")
      sys.stdout.flush()
      
    last_percent_reported = percent
        
def maybe_download(filename, expected_bytes, force=False):
  """Download a file if not present, and make sure it's the right size."""
  if force or not os.path.exists(filename):
    print('Attempting to download:', filename) 
    filename, _ = urlretrieve(url + filename, filename, reporthook=download_progress_hook)
    print('\nDownload Complete!')
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', filename)
  else:
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

train_filename = maybe_download('train.tar.gz',404141560)
test_filename = maybe_download('test.tar.gz',276555967)

Found and verified train.tar.gz
Found and verified test.tar.gz


### Extract files

In [424]:
np.random.seed(133)

def maybe_extract(filename, force=False):
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
  if os.path.isdir(root) and not force:
    # You may override by setting force=True.
    print('%s already present - Skipping extraction of %s.' % (root, filename))
  else:
    print('Extracting data for %s. This may take a while. Please wait.' % root)
    tar = tarfile.open(filename)
    sys.stdout.flush()
    tar.extractall()
    tar.close()
  data_folders = root
  
  print(data_folders)
  return data_folders
  
train_folders = maybe_extract(train_filename)
test_folders = maybe_extract(test_filename)

train already present - Skipping extraction of train.tar.gz.
train
test already present - Skipping extraction of test.tar.gz.
test


In [None]:
### Format .mat data

In [425]:
import h5py


class ImageData:
    def __init__(self, imageDataMatFilePath):
        self.file = h5py.File(imageDataMatFilePath, 'r')
        self.names = self.file['digitStruct/name']
        self.boxes = self.file['digitStruct/bbox']

train_file = os.path.join(train_folders, 'digitStruct.mat')
image_data = ImageData(train_file)


In [450]:
from PIL import Image

image_vertices_count=4

''' 5 places + 1 for length'''
digit_count=6 

#Images to be processed
train_image_count = len(image_data.names)-33380
print(train_image_count)

#Array to hold labels of digits in an image
#Add one for index
image_labels_array=np.zeros(train_image_count*(digit_count+1)).reshape(train_image_count,digit_count+1)
image_dimention_array=np.zeros(train_image_count*(image_vertices_count+1)).reshape(train_image_count,image_vertices_count+1)
print("image_labels.shape",image_labels_array.shape, "image_dimention_array.shape:",image_dimention_array.shape)

##Iterate over all images and get dims of images's bbox ( and not digit bbox), and digit labels.
for count in range(train_image_count):
    left=0.
    top=0.
    bottom=0.
    right=0.
    img_label=image_data.file[image_data.boxes[count].item()]["label"]
    img_left=image_data.file[image_data.boxes[count].item()]["left"]
    img_top=image_data.file[image_data.boxes[count].item()]["top"]
    img_height=image_data.file[image_data.boxes[count].item()]["height"]
    img_width=image_data.file[image_data.boxes[count].item()]["width"]
    num_digits=len(img_label)
    print("label len:",len(img_label))
    
    ##Get bbox and digits for each image
    for digit_counter in np.arange(num_digits):
        if(num_digits == 1):
            tmp_label=img_label.value[0]
            image_labels_array[count,digit_counter]=tmp_label
            tmp_left=img_left.value[0]
            tmp_top=img_top.value[0]
            tmp_height=img_height.value[0]
            tmp_width=img_width.value[0]
        else:
            tmp_label=image_data.file[img_label.value[digit_counter].item()].value[0]
            image_labels_array[count,digit_counter]=tmp_label
            tmp_left=image_data.file[img_left.value[digit_counter].item()].value[0]
            tmp_top=image_data.file[img_top.value[digit_counter].item()].value[0]
            tmp_height=image_data.file[img_height.value[digit_counter].item()].value[0]
            tmp_width=image_data.file[img_width.value[digit_counter].item()].value[0]
        tmp_bottom=tmp_top+tmp_height
        tmp_right=tmp_left+tmp_width
        print("tmp_label: ",tmp_label,", tmp_left:",tmp_left,", tmp_top:",tmp_top,", tmp_height:",tmp_height,", tmp_width:",tmp_width)
        
        #Find the largest margins by taking min of top and left, and max of right and bottom
        #Also reduce top and left margins, and increase right and bottom margings to avoid cutting digits
        if(left==0 or tmp_left < left):
            left=tmp_left*0.95;
        if(top==0 or tmp_top < top):
            top=tmp_top*0.95
        if(bottom ==0 or tmp_bottom > bottom):
            bottom=tmp_bottom*1.02
        if(right==0 or tmp_right > right):
            right=tmp_right*1.02
        #print("left: ", left, ", top: ", top,", right: ", right,", bottom: ", bottom)
    print("left: ", left, ", top: ", top,", right: ", right,", bottom: ", bottom)
    fullname = os.path.join(train_folders, str(count+1)+".png")
    print(fullname)
    im = Image.open(fullname)
    im = im.crop((left, top, right, bottom)).resize([128,128], Image.ANTIALIAS)
    im.show()
    print('-'*80)

22
image_labels.shape (22, 7) image_dimention_array.shape: (22, 5)
label len: 2
tmp_label:  [ 1.] , tmp_left: [ 246.] , tmp_top: [ 77.] , tmp_height: [ 219.] , tmp_width: [ 81.]
tmp_label:  [ 9.] , tmp_left: [ 323.] , tmp_top: [ 81.] , tmp_height: [ 219.] , tmp_width: [ 96.]
left:  [ 233.7] , top:  [ 73.15] , right:  [ 427.38] , bottom:  [ 301.92]
train/1.png
--------------------------------------------------------------------------------
label len: 2
tmp_label:  [ 2.] , tmp_left: [ 77.] , tmp_top: [ 29.] , tmp_height: [ 32.] , tmp_width: [ 23.]
tmp_label:  [ 3.] , tmp_left: [ 98.] , tmp_top: [ 25.] , tmp_height: [ 32.] , tmp_width: [ 26.]
left:  [ 73.15] , top:  [ 23.75] , right:  [ 126.48] , bottom:  [ 62.22]
train/2.png
--------------------------------------------------------------------------------
label len: 2
tmp_label:  [ 2.] , tmp_left: [ 17.] , tmp_top: [ 5.] , tmp_height: [ 15.] , tmp_width: [ 8.]
tmp_label:  [ 5.] , tmp_left: [ 25.] , tmp_top: [ 5.] , tmp_height: [ 15.] , tm

In [None]:
import h5py

# The DigitStructFile is just a wrapper around the h5py data.  It basically references 
#    inf:              The input h5 matlab file
#    digitStructName   The h5 ref to all the file names
#    digitStructBbox   The h5 ref to all struc data
class DigitStructFile:
    def __init__(self, inf):
        self.inf = h5py.File(inf, 'r')
        self.digitStructName = self.inf['digitStruct']['name']
        self.digitStructBbox = self.inf['digitStruct']['bbox']

# getName returns the 'name' string for for the n(th) digitStruct. 
    def getName(self,n):
        return ''.join([chr(c[0]) for c in self.inf[self.digitStructName[n][0]].value])

# bboxHelper handles the coding difference when there is exactly one bbox or an array of bbox. 
    def bboxHelper(self,attr):
        if (len(attr) > 1):
            attr = [self.inf[attr.value[j].item()].value[0][0] for j in range(len(attr))]
        else:
            attr = [attr.value[0][0]]
        return attr

# getBbox returns a dict of data for the n(th) bbox. 
    def getBbox(self,n):
        bbox = {}
        bb = self.digitStructBbox[n].item()
        bbox['height'] = self.bboxHelper(self.inf[bb]["height"])
        bbox['label'] = self.bboxHelper(self.inf[bb]["label"])
        bbox['left'] = self.bboxHelper(self.inf[bb]["left"])
        bbox['top'] = self.bboxHelper(self.inf[bb]["top"])
        bbox['width'] = self.bboxHelper(self.inf[bb]["width"])
        return bbox

    def getDigitStructure(self,n):
        s = self.getBbox(n)
        s['name']=self.getName(n)
        return s

# getAllDigitStructure returns all the digitStruct from the input file.     
    def getAllDigitStructure(self):
        return [self.getDigitStructure(i) for i in range(len(self.digitStructName))]

# Return a restructured version of the dataset (one structure by boxed digit).
#
#   Return a list of such dicts :
#      'filename' : filename of the samples
#      'boxes' : list of such dicts (one by digit) :
#          'label' : 1 to 9 corresponding digits. 10 for digit '0' in image.
#          'left', 'top' : position of bounding box
#          'width', 'height' : dimension of bounding box
#
# Note: We may turn this to a generator, if memory issues arise.
    def getAllDigitStructure_ByDigit(self):
        pictDat = self.getAllDigitStructure()
        result = []
        structCnt = 1
        for i in range(len(pictDat)):
            item = { 'filename' : pictDat[i]["name"] }
            figures = []
            for j in range(len(pictDat[i]['height'])):
               figure = {}
               figure['height'] = pictDat[i]['height'][j]
               figure['label']  = pictDat[i]['label'][j]
               figure['left']   = pictDat[i]['left'][j]
               figure['top']    = pictDat[i]['top'][j]
               figure['width']  = pictDat[i]['width'][j]
               figures.append(figure)
            structCnt = structCnt + 1
            item['boxes'] = figures
            result.append(item)
        return result

In [None]:
train_folders = 'train'
test_folders = 'test'

train_fin = os.path.join(train_folders, 'digitStruct.mat')
train_dsf = DigitStructFile(train_fin)
train_data = train_dsf.getAllDigitStructure_ByDigit()

In [None]:
test_fin = os.path.join(test_folders, 'digitStruct.mat')
test_dsf = DigitStructFile(test_fin)
test_data = test_dsf.getAllDigitStructure_ByDigit()

In [None]:

from PIL import Image
train_imsize = np.ndarray([len(train_data),2])
for i in np.arange(len(train_data)):
    filename = train_data[i]['filename']
    fullname = os.path.join(train_folders, filename)
    im = Image.open(fullname)
    train_imsize[i, :] = im.size[:]


print(np.amax(train_imsize[:,0]), np.amax(train_imsize[:,1]))
print(np.amin(train_imsize[:,0]), np.amin(train_imsize[:,1]))

In [None]:


test_imsize = np.ndarray([len(test_data),2])
for i in np.arange(len(test_data)):
    filename = test_data[i]['filename']
    fullname = os.path.join(test_folders, filename)
    im = Image.open(fullname)
    test_imsize[i, :] = im.size[:]

print(np.amax(test_imsize[:,0]), np.amax(test_imsize[:,1]))
print(np.amin(test_imsize[:,0]), np.amin(test_imsize[:,1]))



In [None]:
import PIL.Image as Image

def generate_dataset(data, folder):

    dataset = np.ndarray([len(data),32,32,1], dtype='float32')
    labels = np.ones([len(data),6], dtype=int) * 10
    for i in np.arange(len(data)):
        filename = data[i]['filename']
        fullname = os.path.join(folder, filename)
        im = Image.open(fullname)
        boxes = data[i]['boxes']
        num_digit = len(boxes)
        labels[i,0] = num_digit
        top = np.ndarray([num_digit], dtype='float32')
        left = np.ndarray([num_digit], dtype='float32')
        height = np.ndarray([num_digit], dtype='float32')
        width = np.ndarray([num_digit], dtype='float32')
        for j in np.arange(num_digit):
            if j < 5: 
                labels[i,j+1] = boxes[j]['label']
                if boxes[j]['label'] == 10: labels[i,j+1] = 0
            else: print('#',i,'image has more than 5 digits.')
            top[j] = boxes[j]['top']
            left[j] = boxes[j]['left']
            height[j] = boxes[j]['height']
            width[j] = boxes[j]['width']
        
        im_top = np.amin(top)
        im_left = np.amin(left)
        im_height = np.amax(top) + height[np.argmax(top)] - im_top
        im_width = np.amax(left) + width[np.argmax(left)] - im_left
        
        im_top = np.floor(im_top - 0.1 * im_height).astype(int)
        im_left = np.floor(im_left - 0.1 * im_width).astype(int)
        im_bottom = np.amin([np.ceil(im_top + 1.2 * im_height), im.size[1]]).astype(int)
        im_right = np.amin([np.ceil(im_left + 1.2 * im_width), im.size[0]]).astype(int)

        im = im.crop((im_left, im_top, im_right, im_bottom)).resize([32,32], Image.ANTIALIAS)
        im = np.dot(np.array(im, dtype='float32'), [[0.2989],[0.5870],[0.1140]])
        mean = np.mean(im, dtype='float32')
        std = np.std(im, dtype='float32', ddof=1)
        if std < 1e-4: std = 1.
        im = (im - mean) / std
        dataset[i,:,:,:] = im[:,:,:]

    return dataset, labels

train_dataset, train_labels = generate_dataset(train_data, train_folders)
print(train_dataset.shape, train_labels.shape)

test_dataset, test_labels = generate_dataset(test_data, test_folders)
print(test_dataset.shape, test_labels.shape)



In [None]:
tmp_img=train_dataset[0]
print(tmp_img[:,:,0].shape)
#tmp_img[:,:,0].show()
image_data=tmp_img[:,:,0]
fig, plt_axes_arr=plt.subplots(2, 2)
plt_axes_arr[0,0].imshow(image_data,cmap='Greys')

image_data = (image_data.astype(float) - 
                    255.0 / 2) / 255.0
plt_axes_arr[0,1].imshow(image_data,cmap='Greys')
plt.show()

In [None]:
#mat_contents = sio.loadmat('train/digitStruct.mat')
f = h5py.File('train/digitStruct.mat')
names = f.get('digitStruct/name') 
bboxes = f.get('digitStruct/bbox') 

In [None]:
print(f[names[0])

In [None]:
save('train/digitStruct.mat','-v7')