In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import gzip, binascii, struct, numpy
import numpy as np

We will train the classifier to recognize both MNIST numbers and NON-MNIST letters. Our training plan is approximately as follows:
1) Load mnist data
2) Load non-mnist data
3) Merge both data sources into a single array
4) Merge their labels into a single array
5) Split each data set into training and test sets.
5.1) Merge training data sets for both mnist and non-mnist
5.2) Merge test datasets
5.3) Add "empty" training set
5.4) Add "empty" test set
6) Build the TensorFlow model
7) Train it.

In [2]:
# Folder (relative to the notebook) that holds the gzipped MNIST archives.
directory = 'mnist'

# Derive the four archive paths from the shared directory prefix, in the
# order: training images, training labels, test images, test labels.
(train_data_filename,
 train_labels_filename,
 test_data_filename,
 test_labels_filename) = (
    directory + suffix
    for suffix in (
        '/train-images-idx3-ubyte.gz',
        '/train-labels-idx1-ubyte.gz',
        '/t10k-images-idx3-ubyte.gz',
        '/t10k-labels-idx1-ubyte.gz',
    )
)

## Load MNIST dataset
 Extract the images into a 4D tensor [image index, y, x, channels]. For greyscale MNIST, the number of channels is always 1. Values are rescaled from [0, 255] down to [-0.5, 0.5].

First, images:

In [3]:
IMAGE_SIZE = 28     # images are IMAGE_SIZE x IMAGE_SIZE pixels
PIXEL_DEPTH = 255   # largest raw (uint8) pixel value

def extract_data(filename, num_images):
  """Load images from a gzipped IDX image file as a 4D float32 array.

  Args:
    filename: Path to the gzip-compressed image file.
    num_images: Number of images to read from the start of the file.

  Returns:
    numpy.float32 array of shape [num_images, IMAGE_SIZE, IMAGE_SIZE, 1]
    with raw pixel values 0..PIXEL_DEPTH rescaled to [-0.5, 0.5].

  Raises:
    ValueError: (from reshape) if the file holds fewer pixels than requested.
  """
  print('Extracting', filename)
  header_bytes = 16  # magic number + dimensions; the values are known, so skip
  pixels_per_image = IMAGE_SIZE * IMAGE_SIZE
  with gzip.open(filename) as archive:
    archive.read(header_bytes)
    raw = archive.read(pixels_per_image * num_images)
  pixels = numpy.frombuffer(raw, dtype=numpy.uint8).astype(numpy.float32)
  # Centre on zero, then scale by the pixel depth: 0 -> -0.5, 255 -> 0.5.
  centred = (pixels - (PIXEL_DEPTH / 2.0)) / PIXEL_DEPTH
  return centred.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, 1)

# Read the training archive (60000 images) first, then the test archive
# (10000 images); both come back as [N, 28, 28, 1] float32 arrays.
train_data, test_data = (
    extract_data(path, count)
    for path, count in ((train_data_filename, 60000),
                        (test_data_filename, 10000))
)
train_data.shape
# Optional sanity check of the first image:
# plt.imshow(train_data[0].reshape(28, 28))

Extracting mnist/train-images-idx3-ubyte.gz
Extracting mnist/t10k-images-idx3-ubyte.gz


(60000, 28, 28, 1)

Now, load MNIST labels

In [4]:
# Width of every one-hot label row. Per the notebook text, the first 10
# indices are for MNIST digits; the notMNIST labels are shifted by 10.
NUM_LABELS = 20

def extract_labels(filename, num_images):
  """Extract the labels into a 1-hot matrix [image index, label index].

  Args:
    filename: Path to the gzip-compressed IDX label file.
    num_images: Number of labels to read from the start of the file.

  Returns:
    numpy.float32 array of shape [num_images, NUM_LABELS]. Row i holds 1.0
    in the column given by label i and 0.0 elsewhere; a label value
    >= NUM_LABELS yields an all-zero row.

  Raises:
    ValueError: if the file holds fewer than `num_images` labels. Without
      this check a truncated file would silently produce fewer rows than
      requested and fall out of step with the image array.
  """
  print ('Extracting', filename)
  with gzip.open(filename) as bytestream:
    # Skip the magic number and count; we know these values.
    bytestream.read(8)
    # One unsigned byte per label.
    buf = bytestream.read(1 * num_images)
  if len(buf) < num_images:
    raise ValueError(
        'Expected %d labels in %s but found only %d'
        % (num_images, filename, len(buf)))
  labels = numpy.frombuffer(buf, dtype=numpy.uint8)
  # Convert to dense 1-hot representation: broadcasting the class-index row
  # against the label column gives a [num_images, NUM_LABELS] boolean mask.
  return (numpy.arange(NUM_LABELS) == labels[:, None]).astype(numpy.float32)

# Read the one-hot labels for the training archive (60000) and then the
# test archive (10000), in the same order as the images were loaded.
train_labels, test_labels = (
    extract_labels(path, count)
    for path, count in ((train_labels_filename, 60000),
                        (test_labels_filename, 10000))
)
# Show the matrix shape and the first two one-hot rows.
print('Training labels shape', train_labels.shape)
print('First label vector', train_labels[0])
print('Second label vector', train_labels[1])

Extracting mnist/train-labels-idx1-ubyte.gz
Extracting mnist/t10k-labels-idx1-ubyte.gz
Training labels shape (60000, 20)
First label vector [ 0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]
Second label vector [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]


### Split the training data into 2 sets: training and validation

In [5]:
VALIDATION_SIZE = 5000

# The first VALIDATION_SIZE examples become the validation set and the rest
# stay as training data. Each right-hand side is fully evaluated before the
# names are rebound, so both slices come from the same array.
# NOTE: train_data / train_labels are rebound here, so re-running this cell
# without reloading the data removes another VALIDATION_SIZE examples.
validation_data, train_data = (
    train_data[:VALIDATION_SIZE, :, :, :],
    train_data[VALIDATION_SIZE:, :, :, :],
)
validation_labels, train_labels = (
    train_labels[:VALIDATION_SIZE],
    train_labels[VALIDATION_SIZE:],
)

train_size = len(train_labels)

print('Validation shape', validation_data.shape)
print('Train size', train_size)

Validation shape (5000, 28, 28, 1)
Train size 55000


## Load NON-MNIST Dataset

In [6]:
import pickle

In [7]:
# Load the notMNIST pickle, which is read as a dict of arrays keyed by
# '<split>_dataset' / '<split>_labels'.
# NOTE: unpickling can execute arbitrary code, so only load a pickle file
# that comes from a trusted source.
pickle_file = directory + '/notMNIST.pickle'
# The context manager closes the file even if pickle.load raises.
with open(pickle_file, 'rb') as f:
    data = pickle.load(f)

nm_train_data = data['train_dataset']
nm_train_labels = data['train_labels']

nm_valid_data = data['valid_dataset']
nm_valid_labels = data['valid_labels']

nm_test_dataset = data['test_dataset']
nm_test_labels = data['test_labels']

Reshape the data array to match the MNIST layout. It is currently [N, h, w]; we want [N, h, w, channels], where channels == 1 in this case.

In [8]:
# Append a trailing channel axis of size 1; -1 lets NumPy infer the image count.
# NOTE(review): only the training images are reshaped in this cell;
# nm_valid_data / nm_test_dataset keep their loaded shape here. Confirm they
# are reshaped before being merged with the MNIST arrays.
nm_train_data = nm_train_data.reshape((-1, 28, 28, 1))

Shift the labels by 10. The first 10 indices are reserved for the MNIST digits.

In [9]:
# Offset added to every notMNIST label so it lands after the first 10
# indices, which are used for the MNIST digits.
shift_by = 10
# NOTE: this reads the raw integer labels. The following cell rebinds
# nm_train_labels / nm_valid_labels / nm_test_labels to one-hot matrices, so
# run this cell before that one (re-running it afterwards shifts the wrong data).
nm_train_labels_real_adj, nm_valid_labels_adj, nm_test_labels_adj = (
    raw_labels + shift_by
    for raw_labels in (nm_train_labels, nm_valid_labels, nm_test_labels)
)

Convert labels into one-hot encoding similar to MNIST

In [10]:
def to_one_hot(real_labels, num_labels=None):
    """Convert integer class labels into a one-hot float32 matrix.

    Args:
        real_labels: 1-D array or sequence of integer class indices.
        num_labels: Width of each one-hot row. When omitted, the
            notebook-wide NUM_LABELS is used, as before.

    Returns:
        np.float32 array of shape [len(real_labels), num_labels]. Row i
        holds 1.0 in column real_labels[i] and 0.0 elsewhere; a label
        outside [0, num_labels) yields an all-zero row.
    """
    if num_labels is None:
        num_labels = NUM_LABELS
    # asarray lets plain lists/tuples be indexed like arrays below.
    labels = np.asarray(real_labels)
    # Broadcast the class-index row against the label column to build a
    # [N, num_labels] boolean mask, then cast it to float.
    return (np.arange(num_labels) == labels[:, None]).astype(np.float32)
# Replace the integer notMNIST labels with one-hot rows for each split
# (train, validation, test), built from the shifted label arrays.
nm_train_labels, nm_valid_labels, nm_test_labels = map(
    to_one_hot,
    (nm_train_labels_real_adj, nm_valid_labels_adj, nm_test_labels_adj),
)

Merge data and label arrays