# Pre-procesissing Multi Digit SVHN Dataset: Step 1

- Arman Uygur # au2205
- Jonathan Galsurkar #jfg2150
- Nitesh Surtani #ns3148

In [1]:
"""
Adapted from https://github.com/thomalm/svhn-multi-digit
"""

import os
import sys
import tarfile
import numpy as np
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
from IPython.display import display, Image, HTML
import h5py

plt.rcParams['figure.figsize'] = (16.0, 4.0)
%matplotlib inline

In [3]:
base_dir = "data/"

if not os.path.isfile(base_dir + "train_bounding_boxes.csv"):
    print("Run Preprocessing on training data in this sequence")

if not os.path.isfile(base_dir + "test_bounding_boxes.csv"):
    print("Run Preprocessing on testing datain this sequence")

if not os.path.isfile(base_dir + "extra_bounding_boxes.csv"):
    print("Run Preprocessing on extra data in this sequence")

In [6]:
# # Preprocessing for Extra 
process = "extra"
data_dir = "data/extra/"

In [7]:
from PIL import Image
image_size_file = base_dir + process + "_image_sizes.csv"

i=0
if not os.path.isfile(image_size_file):
    sizes = []
    size=[]
    items = [img for img in os.listdir(data_dir) if img.endswith('.png')]

    for item in items:
        width, height = Image.open(data_dir+item).size
        item_size = {'filename': data_dir+item, 'width': width, 'height': height}
        sizes.append(item_size)
        i+=1
        if i%1000==0:
            print(i)
    image_sizes = pd.DataFrame(sizes)
    image_sizes.to_csv(image_size_file, index=False)

image_sizes = pd.read_csv(image_size_file)


In [8]:
# Use the DigitStructWrapper functionality 
from unpacker import DigitStructWrapper

def get_bounding_boxes(start_path = '.'):
    """ Extracts a bounding box file and returns a dictionary
    """
    return DigitStructWrapper(start_path).unpack_all()


In [9]:
# Check if we have already created bounding boxes
bbox_file = base_dir + process + "_bounding_boxes.csv"

if not os.path.isfile(bbox_file):
    bbox = get_bounding_boxes(data_dir+'digitStruct.mat')

    # Convert bbox to list of dicts
    new_bbox = []

    for box in bbox:
            # For every bounding box
            for item in box['boxes']:

                # Store a dict with the file and bounding box info
                new_bbox.append({
                        'filename': data_dir + box['filename'],
                        'label': int(item['label']),
                        'height_low': item['top'],
                        'height_high': item['top']+item['height'],
                        'width_low': item['left'],
                        'width_high': item['left']+item['width'],
                })

    # Convert new_bbox to DF
    df = pd.DataFrame(new_bbox)
    
    aggregate = {'width_low':'min',
                 'height_low':'min',
                 'width_high':'max',
                 'height_high':'max',
                 'label':{
                    'labels': lambda x: list(x),
                    'num_digits': 'count'}}

    # Apply the aggration
    df = df.groupby('filename').agg(aggregate).reset_index()

    # Fix the column names after aggregation
    df.columns = [x[0] if i < 5 else x[1] for i, x in enumerate(df.columns.values)]

    # Save new box
    df.to_csv(bbox_file, index=False)
        
box_df = pd.read_csv(bbox_file)



In [10]:
df = pd.merge(box_df, image_sizes, on='filename', how='inner')
df.loc[:,'labels'] = df.loc[:,'labels'].str.slice(1,-1).str.split(',')
df.head()

Unnamed: 0,filename,width_low,height_low,width_high,height_high,labels,num_digits,height,width
0,data/extra/1.png,24.0,23.0,126.0,126.0,"[4, 7, 8]",3,141,166
1,data/extra/10.png,5.0,5.0,52.0,32.0,"[4, 4, 4]",3,41,52
2,data/extra/100.png,3.0,1.0,46.0,30.0,"[5, 3, 5]",3,32,51
3,data/extra/1000.png,9.0,7.0,36.0,40.0,"[2, 6]",2,49,51
4,data/extra/10000.png,8.0,9.0,41.0,35.0,"[2, 10, 10]",3,44,55


In [11]:
# Correct bounding boxes not contained by image
df.loc[df['width_low'] < 0, 'width_low'] = 0
df.loc[df['height_low'] < 0, 'height_low'] = 0
df.loc[df['width_high'] > df['width'], 'width_high'] = df['width']
df.loc[df['height_high'] > df['height'], 'height_high'] = df['height']

df = df[df.num_digits < 5]

In [12]:
# Crop Images
from scipy.ndimage import imread
from scipy.misc import imresize

req_size = (32, 32)

X = np.zeros(shape=(df.shape[0], req_size[0], req_size[1], 3), dtype='uint8')
y = np.full((df.shape[0], 5), 10, dtype=int)

for i, (index, image) in enumerate(df.iterrows()):
    # Get the image data
    img = imread(image['filename'])
    cropped = img[int(image['height_low']):int(image['height_high']), int(image['width_low']):int(image['width_high']), :]
    X[i] = imresize(cropped, req_size)
    
    labels = np.array((image['labels']))

    labels[labels==10] = 0
    y[i,0:labels.shape[0]] = labels



`imread` is deprecated in SciPy 1.0.0.
Use ``matplotlib.pyplot.imread`` instead.
  if sys.path[0] == '':
`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.resize`` instead.
  


In [16]:
greyscale = np.dot(X, [0.2989, 0.5870, 0.1140])
X_grey = np.expand_dims(greyscale, axis=3).astype(np.float32)

# Create file
h5f = h5py.File(base_dir+process+'_multi_grey.h5', 'w')
h5f.create_dataset(process+'_dataset', data=X)
h5f.create_dataset(process+'_labels', data=y)

# Close the file
h5f.close()