In [None]:
def make_arrays(nb_rows, img_size):
  """Allocate uninitialised image and label buffers for `nb_rows` samples.

  Returns a `(dataset, labels)` pair: `dataset` is a float32 array of shape
  `(nb_rows, img_size, img_size)` and `labels` is an int32 vector of length
  `nb_rows`. When `nb_rows` is falsy (e.g. 0 or None) both are `None`.
  """
  if not nb_rows:
    return None, None
  # np.ndarray allocates without initialising: callers must write every row.
  images = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
  targets = np.ndarray(nb_rows, dtype=np.int32)
  return images, targets

def merge_datasets(pickle_files, train_size, valid_size=0):
  """Build class-balanced training (and optional validation) sets.

  Each entry of `pickle_files` is loaded with pickle and treated as one class
  whose label is its position in the list. Every class array is shuffled in
  place, then contributes its first `valid_size // num_classes` samples to the
  validation set (only when `valid_size` is non-zero) and the following
  `train_size // num_classes` samples to the training set.

  Returns `(valid_dataset, valid_labels, train_dataset, train_labels)`; the
  validation pair is `(None, None)` when `valid_size` is 0.

  Notes:
    * Buffer dimensions come from the notebook-level `image_size`.
    * Buffers are allocated uninitialised, so when a size is not a multiple of
      the class count the trailing rows are never written.
    * Any exception raised while processing a file is reported and re-raised.
  """
  class_count = len(pickle_files)
  valid_dataset, valid_labels = make_arrays(valid_size, image_size)
  train_dataset, train_labels = make_arrays(train_size, image_size)
  per_class_valid = valid_size // class_count
  per_class_train = train_size // class_count
  # Within one class file, rows [0, per_class_valid) feed validation and rows
  # [per_class_valid, train_stop) feed training.
  train_stop = per_class_valid + per_class_train

  # Write offsets into the merged output buffers.
  valid_pos, train_pos = 0, 0
  for label, pickle_file in enumerate(pickle_files):
    try:
      with open(pickle_file, 'rb') as f:
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        letter_set = pickle.load(f)
        # Shuffle so the validation/training split of this class is random.
        np.random.shuffle(letter_set)
        if valid_dataset is not None:
          valid_rows = slice(valid_pos, valid_pos + per_class_valid)
          valid_dataset[valid_rows, :, :] = letter_set[:per_class_valid, :, :]
          valid_labels[valid_rows] = label
          valid_pos += per_class_valid

        train_rows = slice(train_pos, train_pos + per_class_train)
        train_dataset[train_rows, :, :] = letter_set[per_class_valid:train_stop, :, :]
        train_labels[train_rows] = label
        train_pos += per_class_train
    except Exception as e:
      print('Unable to process data from', pickle_file, ':', e)
      raise

  return valid_dataset, valid_labels, train_dataset, train_labels
            
            
# Total number of samples per split; merge_datasets divides each evenly across classes.
train_size, valid_size, test_size = 200000, 10000, 10000

# Training and validation samples are carved out of the same per-class pickles.
(valid_dataset, valid_labels,
 train_dataset, train_labels) = merge_datasets(
    train_datasets, train_size, valid_size=valid_size)
# No validation part is requested for the test pickles, so the first two results are None.
_, _, test_dataset, test_labels = merge_datasets(test_datasets, train_size=test_size)

print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)

Next, we'll randomize the data. It's important to have the labels well shuffled for the training and test distributions to match.

In [None]:
def randomize(dataset, labels):
  """Return `dataset` and `labels` reordered by one shared random permutation.

  The permutation is drawn from NumPy's global random state over
  `labels.shape[0]` entries and applied to the first of `dataset`'s three
  axes, so every image keeps its own label. The inputs are not modified.
  """
  order = np.random.permutation(labels.shape[0])
  return dataset[order, :, :], labels[order]
# Shuffle each split with its own permutation; images and labels stay paired.
train_dataset, train_labels = randomize(dataset=train_dataset, labels=train_labels)
test_dataset, test_labels = randomize(dataset=test_dataset, labels=test_labels)
valid_dataset, valid_labels = randomize(dataset=valid_dataset, labels=valid_labels)

---
Problem 4
---------
Convince yourself that the data is still good after shuffling!

---

In [None]:
# Let's print 12 random samples for the dataset and check they correspond to their labels.
def check_consistency(samples,labels,title):
    """Plot 12 randomly chosen samples in one row, each titled with its letter.

    Indices are drawn with `np.random.randint`, so the same sample may appear
    more than once. Each label is used as an index into 'ABCDEFGHIJ' to get
    the subplot title. If `samples` and `labels` differ in length, a message
    is printed and nothing is plotted.
    """
    sample_count = len(samples)
    if sample_count != len(labels):
        print("Samples and labels should have same length.")
        return
    class_names = 'ABCDEFGHIJ'
    picks = np.random.randint(sample_count, size=12)
    fig = plt.figure(figsize=(12, 3), dpi=80)
    fig.suptitle(title, fontsize=14)
    # Subplot positions are 1-based, hence start=1.
    for position, pick in enumerate(picks, start=1):
        plt.subplot(1, 12, position)
        plt.title(class_names[labels[pick]])
        plt.imshow(samples[pick], interpolation='nearest', cmap='Greys')
    plt.tight_layout()

# Visual spot-check of the shuffled training and validation splits.
check_consistency(samples=train_dataset, labels=train_labels, title="Training set")
check_consistency(samples=valid_dataset, labels=valid_labels, title="Validation set")

Finally, let's save the data for later reuse:

In [None]:
# Destination of the combined dataset, inside the notebook's data folder.
pickle_file = os.path.join(data_folder,'notMNIST.pickle')

try:
  # Build the payload before opening the file so a failure here does not
  # leave a truncated pickle behind.
  save = {
    'train_dataset': train_dataset,
    'train_labels': train_labels,
    'valid_dataset': valid_dataset,
    'valid_labels': valid_labels,
    'test_dataset': test_dataset,
    'test_labels': test_labels,
    }
  # The context manager closes the handle even when pickle.dump raises.
  with open(pickle_file, 'wb') as f:
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
  print('Unable to save data to', pickle_file, ':', e)
  raise

In [None]:
# Report the on-disk size, in bytes, of the file at `pickle_file`.
# NOTE(review): the label says "Compressed", but the file is written by a plain
# pickle.dump with no compression step; confirm whether the wording is intended.
statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)

---
Problem 5
---------

By construction, this dataset might contain a lot of overlapping samples, including training data that's also contained in the validation and test sets! Overlap between training and test can skew the results if you expect to use your model in an environment where there is never an overlap, but it is actually OK if you expect to see training samples recur when you use it.
Measure how much overlap there is between training, validation and test samples.

Optional questions:
- What about near duplicates between datasets? (images that are almost identical)
- Create a sanitized validation and test set, and compare your accuracy on those in subsequent assignments.
---

In [None]:
import time
import hashlib
def clean_overlap(train_set,test_set,test_labels,valid_set,valid_labels):
    """Remove exact duplicates across splits from the validation and test sets.

    Every image is fingerprinted with the SHA-1 of its raw bytes. A validation
    sample is dropped when its fingerprint occurs in `train_set`; a test
    sample is dropped when its fingerprint occurs in `train_set` or in
    `valid_set`. The training set itself is never modified.

    Args:
        train_set, test_set, valid_set: sequences of image arrays (a NumPy
            array whose first axis indexes samples, or a list of arrays).
        test_labels, valid_labels: labels aligned with `test_set` and
            `valid_set` respectively.

    Returns:
        `(valid_clean, valid_labels_clean, test_clean, test_labels_clean)` as
        NumPy arrays, filtered from the `valid_set` / `test_set` arguments
        (not from any notebook-level variable).

    Side effects:
        Prints the elapsed time, the three overlap counts and the shapes of
        the cleaned sets.
    """
    t1 = time.time()

    def _digests(images):
        # hashlib needs a contiguous buffer; ascontiguousarray is a no-op for
        # images that already are.
        return [hashlib.sha1(np.ascontiguousarray(x)).digest() for x in images]

    #create lists with hashes for each image
    train_hashes = _digests(train_set)
    valid_hashes = _digests(valid_set)
    test_hashes  = _digests(test_set)

    #Create arrays containing True values for indexes in common.
    #Set membership compares the complete digests and is linear in the
    #number of samples.
    train_seen = set(train_hashes)
    valid_seen = set(valid_hashes)
    valid_in_train = np.array([h in train_seen for h in valid_hashes], dtype=bool)
    test_in_train  = np.array([h in train_seen for h in test_hashes], dtype=bool)
    test_in_valid  = np.array([h in valid_seen for h in test_hashes], dtype=bool)

    #Create arrays containing indexes (as True values) of elements to keep in test and validation sets
    valid_keep = ~valid_in_train
    test_keep  = ~(test_in_train | test_in_valid)

    #Create clean sets from the arguments; asarray lets plain lists of images
    #be filtered with a boolean mask too.
    valid_dataset_clean = np.asarray(valid_set)[valid_keep]
    valid_labels_clean  = np.asarray(valid_labels)[valid_keep]

    test_dataset_clean = np.asarray(test_set)[test_keep]
    test_labels_clean  = np.asarray(test_labels)[test_keep]
    t2 = time.time()
    print("Time: %0.2fs" % (t2 - t1))
    print("valid -> train overlap: %d samples" % valid_in_train.sum())
    print("test  -> train overlap: %d samples" % test_in_train.sum())
    print("test  -> valid overlap: %d samples" % test_in_valid.sum())
    print('Clean validation set:', valid_dataset_clean.shape, valid_labels_clean.shape)
    print('Clean test set:', test_dataset_clean.shape, test_labels_clean.shape)

    return valid_dataset_clean,valid_labels_clean,test_dataset_clean,test_labels_clean
    

In [None]:
# Sanitised validation/test sets: samples whose bytes also occur elsewhere are dropped.
(valid_clean_data, valid_clean_labels,
 test_clean_data, test_clean_labels) = clean_overlap(
    train_set=train_dataset,
    test_set=test_dataset,
    test_labels=test_labels,
    valid_set=valid_dataset,
    valid_labels=valid_labels,
)

To find near duplicates, a cheap and fast way would be to round numbers to, say, one decimal, so similar images will end up the same.

In [None]:
# Round every pixel to one decimal so that almost-identical images hash alike.
rounded_train = [np.round(img, decimals=1) for img in train_dataset]
rounded_test = [np.round(img, decimals=1) for img in test_dataset]
rounded_valid = [np.round(img, decimals=1) for img in valid_dataset]
(valid_clean_rounded, valid_clean_rounded_labels,
 test_clean_rounded, test_clean_rounded_labels) = clean_overlap(
    train_set=rounded_train,
    test_set=rounded_test,
    test_labels=test_labels,
    valid_set=rounded_valid,
    valid_labels=valid_labels,
)

---
Problem 6
---------

Let's get an idea of what an off-the-shelf classifier can give you on this data. It's always good to check that there is something to learn, and that it's a problem that is not so trivial that a canned solution solves it.

Train a simple model on this data using 50, 100, 1000 and 5000 training samples. Hint: you can use the LogisticRegression model from sklearn.linear_model.

Optional question: train an off-the-shelf model on all the data!

---

In [None]:
from sklearn import linear_model

def _flatten_images(images):
    """Return `images` as a 2-D array with one row of features per sample."""
    images = np.asarray(images)
    # Multiply out whatever trailing dimensions exist instead of assuming 28x28.
    return images.reshape(len(images), int(np.prod(images.shape[1:])))


def trainLR(train_data,train_labels,test_data,test_labels):
    """Fit a default LogisticRegression and score it on held-out data.

    Args:
        train_data: training images, array-like with samples on the first
            axis; all remaining axes are flattened into features.
        train_labels: labels aligned with `train_data`.
        test_data: evaluation images, flattened the same way.
        test_labels: labels aligned with `test_data`.

    Returns:
        `(score, seconds)`: the value returned by `clf.score` on the
        evaluation data, and the wall-clock time spent flattening, fitting
        and scoring.
    """
    t1 = time.time()
    train_data_r = _flatten_images(train_data)
    test_data_r = _flatten_images(test_data)
    clf = linear_model.LogisticRegression()
    clf.fit(train_data_r,train_labels)
    score = clf.score(test_data_r,test_labels)
    t2 = time.time()
    return score,(t2-t1)
    


In [None]:
# Baseline: fit on the first 50/100/1000/5000 training samples and score each
# model on the validation set as merged (overlaps with training included).
print("Training and validating on the original dataset")
s50, t50 = trainLR(train_dataset[:50], train_labels[:50], valid_dataset, valid_labels)
print(s50, t50)
s100, t100 = trainLR(train_dataset[:100], train_labels[:100], valid_dataset, valid_labels)
print(s100, t100)
s1000, t1000 = trainLR(train_dataset[:1000], train_labels[:1000], valid_dataset, valid_labels)
print(s1000, t1000)
s5000, t5000 = trainLR(train_dataset[:5000], train_labels[:5000], valid_dataset, valid_labels)
print(s5000, t5000)

In [None]:
# Same training subsets as the baseline, but scored on `valid_clean_data`, the
# validation set with samples that also occur in the training set removed.
print("Training and validating on the set")
s50, t50 = trainLR(train_dataset[:50], train_labels[:50], valid_clean_data, valid_clean_labels)
print(s50, t50)
s100, t100 = trainLR(train_dataset[:100], train_labels[:100], valid_clean_data, valid_clean_labels)
print(s100, t100)
s1000, t1000 = trainLR(train_dataset[:1000], train_labels[:1000], valid_clean_data, valid_clean_labels)
print(s1000, t1000)
s5000, t5000 = trainLR(train_dataset[:5000], train_labels[:5000], valid_clean_data, valid_clean_labels)
print(s5000, t5000)

We observe that the score is lower when the overlap is removed, which makes sense: samples from the training set that are also present in the validation set inflate the measured accuracy of the predictions.