# Get SVHN data

In [106]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from PIL import Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
import cPickle as pickle
import matplotlib.image as mpimg

# Config the matlotlib backend as plotting inline in IPython
%matplotlib inline

In [107]:
###Load SVHN Data files, including mat data, for train and test datasets.

In [108]:
url = 'http://ufldl.stanford.edu/housenumbers/'
last_percent_reported = None

def download_progress_hook(count, blockSize, totalSize):
  """A hook to report the progress of a download. This is mostly intended for users with
  slow internet connections. Reports every 1% change in download progress.
  """
  global last_percent_reported
  percent = int(count * blockSize * 100 / totalSize)

  if last_percent_reported != percent:
    if percent % 5 == 0:
      sys.stdout.write("%s%%" % percent)
      sys.stdout.flush()
    else:
      sys.stdout.write(".")
      sys.stdout.flush()
      
    last_percent_reported = percent
        
def maybe_download(filename, expected_bytes, force=False):
  """Download a file if not present, and make sure it's the right size."""
  if force or not os.path.exists(filename):
    print('Attempting to download:', filename) 
    filename, _ = urlretrieve(url + filename, filename, reporthook=download_progress_hook)
    print('\nDownload Complete!')
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', filename)
  else:
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

train_filename = maybe_download('train.tar.gz',404141560)
test_filename = maybe_download('test.tar.gz',276555967)

Found and verified train.tar.gz
Found and verified test.tar.gz


### Extract files

In [109]:
np.random.seed(133)

def maybe_extract(filename, force=False):
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
  if os.path.isdir(root) and not force:
    # You may override by setting force=True.
    print('%s already present - Skipping extraction of %s.' % (root, filename))
  else:
    print('Extracting data for %s. This may take a while. Please wait.' % root)
    tar = tarfile.open(filename)
    sys.stdout.flush()
    tar.extractall()
    tar.close()
  data_folders = root
  
  print(data_folders)
  return data_folders
  
train_folders = maybe_extract(train_filename)
test_folders = maybe_extract(test_filename)

train already present - Skipping extraction of train.tar.gz.
train
test already present - Skipping extraction of test.tar.gz.
test


### Format .mat data

In [110]:
import h5py
import time 

pixel_depth = 255.0  # Number of levels per pixel.

class ImageData:
    def __init__(self, data_folder, imageDataMatFilePath):
        self.data_folder=data_folder
        self.file = os.path.join(data_folder, 'digitStruct.mat')
        self.file = h5py.File(self.file, 'r')
        self.names = self.file['digitStruct/name']
        self.boxes = self.file['digitStruct/bbox']
    
    def get_labels_and_dataset(self, is_train):
        image_vertices_count=4
        
        ''' 5 places + 1 for length'''
        digit_count=6
        desired_image_size=32

        #Images to be processed
        image_count = len(self.names)

        #Array to hold labels of digits in an image
        #Add one for length
        print("is_train:",is_train)
        if (is_train == True) :
            multiplier = 2
        else:
            multiplier = 1
        image_labels_array=np.zeros(multiplier * image_count * (digit_count)).reshape(multiplier * image_count,digit_count)
        dataset_basic = np.ndarray(shape=(multiplier * image_count, desired_image_size, desired_image_size),dtype=np.float32)

        print("image_labels: ",image_labels_array.shape)
        start_time = time.time()


        ##Iterate over all images and get dims of images's bbox ( and not digit bbox), and digit labels.
        for count in range(image_count):
            left=0.
            top=0.
            bottom=0.
            right=0.
            img_label=self.file[self.boxes[count].item()]["label"]
            img_left=self.file[self.boxes[count].item()]["left"]
            img_top=self.file[self.boxes[count].item()]["top"]
            img_height=self.file[self.boxes[count].item()]["height"]
            img_width=self.file[self.boxes[count].item()]["width"]
            num_digits=len(img_label)
            
            
            image_labels_array[count,:]=float(0)
            image_labels_array[count,0]=num_digits
            ##Get bbox and digits for each image
            for digit_counter in np.arange(num_digits):
                if(num_digits == 1):
                    tmp_label=img_label.value[0]
                    image_labels_array[count,digit_count - num_digits + digit_counter]=tmp_label
                    tmp_left=img_left.value[0]
                    tmp_top=img_top.value[0]
                    tmp_height=img_height.value[0]
                    tmp_width=img_width.value[0]
                else:
                    tmp_label=self.file[img_label.value[digit_counter].item()].value[0]
                    image_labels_array[count,digit_count - num_digits + digit_counter]=tmp_label
                    tmp_left=self.file[img_left.value[digit_counter].item()].value[0]
                    tmp_top=self.file[img_top.value[digit_counter].item()].value[0]
                    tmp_height=self.file[img_height.value[digit_counter].item()].value[0]
                    tmp_width=self.file[img_width.value[digit_counter].item()].value[0]
                tmp_bottom=tmp_top+tmp_height
                tmp_right=tmp_left+tmp_width

                #Find the largest margins by taking min of top and left, and max of right and bottom
                #Also reduce top and left margins, and increase right and bottom margings to avoid cutting digits
                # cases handling x==0 are for initialization
                if(left==0 or tmp_left < left):
                    left=tmp_left*0.95;
                if(top==0 or tmp_top < top):
                    top=tmp_top*0.95
                if(bottom ==0 or tmp_bottom > bottom):
                    bottom=tmp_bottom*1.05
                if(right==0 or tmp_right > right):
                    right=tmp_right*1.05
            fullname = os.path.join(self.data_folder, str(count+1)+".png")
            im = Image.open(fullname)
            #plt.imshow(im)
            #im.show()
            im_orig = im.crop((left, top, right, bottom)).resize([desired_image_size,desired_image_size], Image.ANTIALIAS).convert('L')
            dataset_basic[count]=im_orig
            #plt.imshow(im_orig, cmap='Greys_r')
            if (is_train == True):
                dataset_basic[image_count + count]=im_anticlock
                plt.imshow(im_anticlock)
                image_labels_array[image_count + count] = image_labels_array[count]
            
        print("image_labels_array: ",image_labels_array.shape, "\n shape: ",image_labels_array[0:2,:],\
              ",\n",image_labels_array[image_count:image_count+2,:])
      
        print("Time taken to find image bounding box:\n- %4.4f seconds ---" % (time.time() - start_time))
        print("\nsample:\n",dataset_basic[0:1,:])
        dataset_basic = (dataset_basic.astype(float) - pixel_depth / 2) / pixel_depth
        print("\nsample:\n",dataset_basic[0:1,:])
        print('Mean:', np.mean(dataset_basic))
        print('Standard deviation:', np.std(dataset_basic))
        
        
        return image_labels_array, dataset_basic 




In [111]:
train_image_data = ImageData(train_folders, 'digitStruct.mat')
train_image_labels, train_dataset_basic = train_image_data.get_labels_and_dataset(False)


is_train: False
image_labels:  (33402, 6)
image_labels_array:  (33402, 6) 
 shape:  [[ 2.  0.  0.  0.  1.  9.]
 [ 2.  0.  0.  0.  2.  3.]] ,
 []
Time taken to find image bounding box:
- 253.4891 seconds ---

sample:
 [[[  75.   77.   75. ...,   79.   80.   79.]
  [  76.   78.   76. ...,   81.   81.   81.]
  [  79.   78.   78. ...,   83.   83.   82.]
  ..., 
  [ 102.  103.  101. ...,  103.  104.  102.]
  [ 102.  105.  104. ...,  103.  102.  102.]
  [ 100.  106.  104. ...,  101.  103.  102.]]]

sample:
 [[[-0.20588235 -0.19803922 -0.20588235 ..., -0.19019608 -0.18627451
   -0.19019608]
  [-0.20196078 -0.19411765 -0.20196078 ..., -0.18235294 -0.18235294
   -0.18235294]
  [-0.19019608 -0.19411765 -0.19411765 ..., -0.1745098  -0.1745098
   -0.17843137]
  ..., 
  [-0.1        -0.09607843 -0.10392157 ..., -0.09607843 -0.09215686 -0.1       ]
  [-0.1        -0.08823529 -0.09215686 ..., -0.09607843 -0.1        -0.1       ]
  [-0.10784314 -0.08431373 -0.09215686 ..., -0.10392157 -0.09607843 -0.1

In [112]:
##Pickle Train Sets
train_pickle_file = 'SVHN_basic_train_labels.pickle'

try:
  f = open(train_pickle_file, 'wb')
  save = {
    'train_image_labels': train_image_labels
    }
  pickle.dump(save, f, 1)
  f.close()
except Exception as e:
  print('Unable to save data to', train_pickle_file, ':', e)
  raise

In [113]:
##Pickle Train Sets
train_pickle_file = 'SVHN_basic_train_data_basic.pickle'

try:
  f = open(train_pickle_file, 'wb')
  save = {
    'train_dataset': train_dataset_basic
    }
  pickle.dump(save, f, 1)
  f.close()
except Exception as e:
  print('Unable to save data to', train_pickle_file, ':', e)
  raise

statinfo = os.stat(train_pickle_file)
print('Compressed train pickle size:', statinfo.st_size)

Compressed train pickle size: 273629347


In [114]:
test_image_data = ImageData(test_folders, 'digitStruct.mat')
test_imagee_labels, test_dataset_basic = test_image_data.get_labels_and_dataset(False)

is_train: False
image_labels:  (13068, 6)
image_labels_array:  (13068, 6) 
 shape:  [[  1.   0.   0.   0.   0.   5.]
 [  3.   0.   0.   2.   1.  10.]] ,
 []
Time taken to find image bounding box:
- 97.2790 seconds ---

sample:
 [[[ 79.  79.  79. ...,  78.  78.  78.]
  [ 79.  80.  80. ...,  78.  77.  78.]
  [ 79.  80.  81. ...,  79.  79.  78.]
  ..., 
  [ 90.  85.  79. ...,  90.  90.  89.]
  [ 87.  82.  77. ...,  90.  90.  90.]
  [ 85.  81.  76. ...,  89.  88.  88.]]]

sample:
 [[[-0.19019608 -0.19019608 -0.19019608 ..., -0.19411765 -0.19411765
   -0.19411765]
  [-0.19019608 -0.18627451 -0.18627451 ..., -0.19411765 -0.19803922
   -0.19411765]
  [-0.19019608 -0.18627451 -0.18235294 ..., -0.19019608 -0.19019608
   -0.19411765]
  ..., 
  [-0.14705882 -0.16666667 -0.19019608 ..., -0.14705882 -0.14705882
   -0.15098039]
  [-0.15882353 -0.17843137 -0.19803922 ..., -0.14705882 -0.14705882
   -0.14705882]
  [-0.16666667 -0.18235294 -0.20196078 ..., -0.15098039 -0.15490196
   -0.15490196]]]
Mean

In [115]:
##Pickle Test Sets
test_pickle_file = 'SVHN_basic_test_labels.pickle'

try:
  f = open(test_pickle_file, 'wb')
  save = {
    'test_image_labels': test_imagee_labels
    }
  pickle.dump(save, f, 1)
  f.close()
except Exception as e:
  print('Unable to save data to', test_pickle_file, ':', e)
  raise

In [116]:
##Pickle Test Sets
test_pickle_file = 'SVHN_basic_test_data_basic.pickle'

try:
  f = open(test_pickle_file, 'wb')
  save = {
    'test_dataset': test_dataset_basic
    }
  pickle.dump(save, f, 1)
  f.close()
except Exception as e:
  print('Unable to save data to', test_pickle_file, ':', e)
  raise

statinfo = os.stat(test_pickle_file)
print('Compressed test data pickle size:', statinfo.st_size)

Compressed test data pickle size: 107053218
