# Get SVHN data

In [66]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from PIL import Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
import cPickle as pickle
import matplotlib.image as mpimg


# Config the matlotlib backend as plotting inline in IPython
%matplotlib inline

In [67]:
###Load SVHN Data files, including mat data, for train and test datasets.

In [68]:
url = 'http://ufldl.stanford.edu/housenumbers/'
local_path='../original_data_files/'
last_percent_reported = None

def download_progress_hook(count, blockSize, totalSize):
  """A hook to report the progress of a download. This is mostly intended for users with
  slow internet connections. Reports every 1% change in download progress.
  """
  global last_percent_reported
  percent = int(count * blockSize * 100 / totalSize)

  if last_percent_reported != percent:
    if percent % 5 == 0:
      sys.stdout.write("%s%%" % percent)
      sys.stdout.flush()
    else:
      sys.stdout.write(".")
      sys.stdout.flush()
      
    last_percent_reported = percent
        
def maybe_download(filename, expected_bytes, force=False):
  """Download a file if not present, and make sure it's the right size."""
  if force or not os.path.exists(local_path + filename):
    print('Attempting to download:', local_path+ filename) 
    filename, _ = urlretrieve(url + filename, local_path+filename, reporthook=download_progress_hook)
    print('\nDownload Complete!')
  statinfo = os.stat(local_path+filename)
  print(statinfo)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', local_path+filename)
  else:
    raise Exception(
      'Failed to verify ' + local_path+filename + '. Can you get to it with a browser?')
  return local_path+ filename

train_filename = maybe_download('train.tar.gz',404141560)
test_filename = maybe_download('test.tar.gz',276555967)


nt.stat_result(st_mode=33206, st_ino=0L, st_dev=0L, st_nlink=0, st_uid=0, st_gid=0, st_size=404141560L, st_atime=1474310160L, st_mtime=1472399951L, st_ctime=1474310160L)
Found and verified ../original_data_files/train.tar.gz
nt.stat_result(st_mode=33206, st_ino=0L, st_dev=0L, st_nlink=0, st_uid=0, st_gid=0, st_size=276555967L, st_atime=1474310159L, st_mtime=1472399729L, st_ctime=1474310159L)
Found and verified ../original_data_files/test.tar.gz


### Extract files

In [69]:
np.random.seed(133)

def maybe_extract(filename, force=False):
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
  if os.path.isdir(root) and not force:
    # You may override by setting force=True.
    print('%s already present - Skipping extraction of %s.' % (root, filename))
  else:
    print('Extracting data for %s. This may take a while. Please wait.' % root)
    tar = tarfile.open(filename)
    sys.stdout.flush()
    tar.extractall()
    tar.close()
  data_folders = root
  
  print(data_folders)
  return data_folders
  
train_folders = maybe_extract(train_filename)
test_folders = maybe_extract(test_filename)



../original_data_files/train already present - Skipping extraction of ../original_data_files/train.tar.gz.
../original_data_files/train
../original_data_files/test already present - Skipping extraction of ../original_data_files/test.tar.gz.
../original_data_files/test


### Format .mat data

In [70]:
import h5py
import time 

pixel_depth = 128.0  # Number of levels per pixel.

class ImageData:
    def __init__(self, data_folder, imageDataMatFilePath):
        self.data_folder=data_folder
        self.file = os.path.join(data_folder, 'digitStruct.mat')
        self.file = h5py.File(self.file, 'r')
        self.names = self.file['digitStruct/name']
        self.boxes = self.file['digitStruct/bbox']
    
    def get_labels_and_dataset(self, generate_synthetic, should_rotate):
        image_vertices_count=4
        
        ''' 5 places + 1 for length'''
        digit_count=6
        desired_image_size=32
        if generate_synthetic == True:
            cut_percentage=0.05
        else:
            cut_percentage=0.0
        
        if should_rotate == True:
            rotate_angle=10
        else:
            rotate_angle=0

        #Images to be processed
        image_count = len(self.names)

        #Array to hold labels of digits in an image
        #Add one for length
        print("generate_synthetic:",generate_synthetic, ", should_rotate:",should_rotate)
        image_labels_array=np.zeros(image_count * (digit_count)).reshape(image_count,digit_count)
        dataset_basic = np.ndarray(shape=(image_count, desired_image_size, desired_image_size),dtype=np.float32)

        print("image_labels: ",image_labels_array.shape)
        start_time = time.time()


        ##Iterate over all images and get dims of images's bbox ( and not digit bbox), and digit labels.
        for count in range(image_count):
            left=0.
            top=0.
            bottom=0.
            right=0.
            img_label=self.file[self.boxes[count].item()]["label"]
            img_left=self.file[self.boxes[count].item()]["left"]
            img_top=self.file[self.boxes[count].item()]["top"]
            img_height=self.file[self.boxes[count].item()]["height"]
            img_width=self.file[self.boxes[count].item()]["width"]
            num_digits=len(img_label)

            
            
            image_labels_array[count,:]=float(0)
            image_labels_array[count,0]=num_digits
            ##Get bbox and digits for each image
            for digit_counter in np.arange(num_digits):
                if(num_digits == 1):
                    tmp_label=img_label.value[0]
                    image_labels_array[count,digit_count - num_digits + digit_counter]=tmp_label
                    tmp_left=img_left.value[0]
                    tmp_top=img_top.value[0]
                    tmp_height=img_height.value[0]
                    tmp_width=img_width.value[0]
                else:
                    tmp_label=self.file[img_label.value[digit_counter].item()].value[0]
                    image_labels_array[count,digit_count - num_digits + digit_counter]=tmp_label
                    tmp_left=self.file[img_left.value[digit_counter].item()].value[0]
                    tmp_top=self.file[img_top.value[digit_counter].item()].value[0]
                    tmp_height=self.file[img_height.value[digit_counter].item()].value[0]
                    tmp_width=self.file[img_width.value[digit_counter].item()].value[0]
                #print("tmp_label:", tmp_label)

                tmp_bottom=tmp_top+tmp_height
                tmp_right=tmp_left+tmp_width

                #Find the largest margins by taking min of top and left, and max of right and bottom
                #Also reduce top and left margins, and increase right and bottom margings to avoid cutting digits
                # cases handling x==0 are for initialization
                if(left==0 or tmp_left < left):
                    left=tmp_left * (1 - cut_percentage);
                if(top==0 or tmp_top < top):
                    top=tmp_top * (1 - cut_percentage)
                if(bottom ==0 or tmp_bottom > bottom):
                    bottom=tmp_bottom * (1 - cut_percentage)
                if(right==0 or tmp_right > right):
                    right=tmp_right * (1 - cut_percentage)
            fullname = os.path.join(self.data_folder, str(count+1)+".png")
            im = Image.open(fullname)
            #plt.imshow(im)
            #im.show()
            if count % 2 == 0:
                rotation = rotate_angle
            else:
                rotation=-1 * rotate_angle
            
            #print("rotation: ", rotation)
            im_orig = im.rotate(rotation).crop((left, top, right, bottom)).resize([desired_image_size,desired_image_size]).convert('L')
            
            dataset_basic[count]=im_orig.rotate(rotate_angle)
            #plt.imshow(im_orig, cmap='Greys_r')
                       
        #print("image_labels_array: ",image_labels_array.shape, "\n shape: ",image_labels_array[0:2,:],\
        #      ",\n",image_labels_array[image_count:image_count+2,:])
      
        #print("Time taken to find image bounding box:\n- %4.4f seconds ---" % (time.time() - start_time))
        #print("\nsample:\n",dataset_basic[0:1,:])
        dataset_basic = (dataset_basic.astype(float) - pixel_depth / 2) / pixel_depth
        #print("\nsample:\n",dataset_basic[0:1,:])
        print('Mean:', np.mean(dataset_basic),',  Standard deviation:', np.std(dataset_basic))
        
        
        return image_labels_array, dataset_basic 




In [71]:
def pickle_data_to_file(pickle_filename, pickle_key,pickle_data):
    pickle_file = pickle_filename + '.pickle'

    try:
      f = open(pickle_file, 'wb')
      save = {
        pickle_key : pickle_data
        }
      pickle.dump(save, f, 1)
      f.close()
    except Exception as e:
      print('Unable to save data to', pickle_file, ':', e)
      raise

In [None]:
train_image_data = ImageData(train_folders, 'digitStruct.mat')
train_image_labels, train_dataset = train_image_data.get_labels_and_dataset(False, False)
pickle_data_to_file('train_image_labels','train_image_labels', train_image_labels)
pickle_data_to_file('train_dataset','train_dataset', train_dataset)


train_image_labels_truncated, train_dataset_truncated = train_image_data.get_labels_and_dataset(True, False)
pickle_data_to_file('train_dataset_truncated','train_dataset', train_dataset_truncated)

train_image_labels_rotated, train_dataset_rotated = train_image_data.get_labels_and_dataset(False, True)
pickle_data_to_file('train_dataset_rotated','train_dataset', train_dataset_rotated)


train_image_labels_truncated_rotated, train_dataset_truncated_rotated = train_image_data.get_labels_and_dataset(True, True)
pickle_data_to_file('train_dataset_truncated_rotated','train_dataset', train_dataset_truncated_rotated)



generate_synthetic: False , should_rotate: False
image_labels:  (33402L, 6L)
Mean: 0.381773043533 ,  Standard deviation: 0.393036510116
generate_synthetic: True , should_rotate: False
image_labels:  (33402L, 6L)
Mean: 0.38367424516 ,  Standard deviation: 0.39394778251
generate_synthetic: False , should_rotate: True
image_labels:  (33402L, 6L)
Mean: 0.314791382276 ,  Standard deviation: 0.445769861946
generate_synthetic: True , should_rotate: True
image_labels:  (33402L, 6L)
Mean: 0.315672676704 ,  Standard deviation: 0.447853451258


In [None]:
test_image_data = ImageData(test_folders, 'digitStruct.mat')
test_image_labels, test_dataset = test_image_data.get_labels_and_dataset(False,False)
pickle_data_to_file('test_image_labels','test_image_labels',test_image_labels)
pickle_data_to_file('test_dataset','test_dataset',test_dataset)


generate_synthetic: False , should_rotate: False
image_labels:  (13068L, 6L)
