In [None]:
# Third-party packages
import h5py
import matplotlib.pyplot as pl
%matplotlib inline
import numpy as np
from sklearn.linear_model import LogisticRegression

# this package
from astronn.data import fetch_notMNIST

In [None]:
# Path of the notMNIST HDF5 file; the cells below re-open it read-only with h5py.
# NOTE(review): presumably fetch_notMNIST() downloads/caches the data on first use -- confirm.
cache_file = fetch_notMNIST()

---
Problem 1
---------

Random images

In [None]:
# Class index -> letter lookup; integer labels are translated through this
# list when titling the plots below.
label_map = [letter for letter in 'abcdefghij']

In [None]:
# Show a 3x3 grid of randomly chosen test images, each titled with its letter.
fig, axes = pl.subplots(3, 3, figsize=(5, 5), sharex=True, sharey=True)
with h5py.File(cache_file, 'r') as f:
    # Dataset handles only -- individual images are read lazily when indexed.
    test_images = f['test']['images']
    test_letters = f['test']['labels']

    for ax in axes.flat:
        # one random draw per panel
        idx = np.random.randint(test_images.shape[0])
        ax.imshow(test_images[idx], cmap='Greys', interpolation='nearest')
        ax.set_title(label_map[int(test_letters[idx])])

---
Problem 2
---------

Mean images

In [None]:
# Solution: plot the mean test image of every class.
with h5py.File(cache_file, 'r') as f:
    # Read both arrays into memory once; the labels were previously re-read
    # from disk on every iteration of the plotting loop.
    images = f['test']['images'][:]
    image_labels = f['test']['labels'][:]

# np.unique returns the distinct labels already sorted, so no extra sort is needed
classes = np.unique(image_labels)
nclasses = len(classes)  # reused by the per-class histograms further down

for cls in classes:
    fig, ax = pl.subplots(1, 1, figsize=(2, 2))
    # select all images for a given class, take the pixel-wise mean
    mean_img = images[image_labels == cls].mean(axis=0)
    # greyscale colormap, no interpolation
    ax.imshow(mean_img, cmap='Greys', interpolation='nearest')
    # Title by the class value itself (as the random-image cell does) rather
    # than by loop position, so a missing class cannot shift the letters.
    ax.set_title(label_map[int(cls)])

---
Problem 3
---------

Randomize data

In [None]:
def randomize(data, labels):
    """Shuffle ``data`` and ``labels`` together along their first axis.

    A single random permutation of ``labels.shape[0]`` indices is drawn from
    NumPy's global random state and applied to both arrays, so every row of
    ``data`` stays paired with its label.

    Parameters
    ----------
    data : array_like
        Array indexed along its first axis by the permutation.
    labels : array_like
        Array whose first-axis length sets the permutation size.

    Returns
    -------
    shuffled_data, shuffled_labels : tuple
        Reordered copies of the inputs (the inputs are not modified).
    """
    order = np.random.permutation(labels.shape[0])
    return data[order], labels[order]

# Load each split fully into memory and shuffle it; images and labels of a
# split are permuted together by randomize().
with h5py.File(cache_file, 'r') as f:
    train_group, test_group = f['train'], f['test']  # handles only, no data read yet

    train_dataset, train_labels = randomize(train_group['images'][:],
                                            train_group['labels'][:])
    test_dataset, test_labels = randomize(test_group['images'][:],
                                          test_group['labels'][:])

---
Problem 4
---------
Number per class

In [None]:
# Count training examples per class: integer bin edges 0, 1, ..., nclasses
# give one bin per class label.
np.histogram(train_labels, bins=np.arange(nclasses + 1))

OK, so there are about 50,000 images in each class in the training set.

In [None]:
# Count test examples per class: integer bin edges 0, 1, ..., nclasses
# give one bin per class label.
np.histogram(test_labels, bins=np.arange(nclasses + 1))

And about 1,870 images in each class in the test set.

---
Problem 5
---------
How much overlap is there between training, validation and test samples?

In [None]:
# For each sampled test image, count how many training images are exactly
# identical to it.
n_overlaps = []
# the data has been randomized, so let's just check the first 100 images and assume that
#    is a representative sample
for test_img in test_dataset[:100]:
    # Compare pixel by pixel and require *every* pixel to match. Summing the
    # signed differences and testing the total against 0 lets positive and
    # negative differences cancel, which reports different images as duplicates.
    is_duplicate = (train_dataset == test_img[None]).all(axis=(-2, -1))
    n_overlap = is_duplicate.sum()
    n_overlaps.append(n_overlap)

In [None]:
# Summarise the per-test-image duplicate counts: median first, then the
# full distribution as a histogram.
typical_overlap = np.median(n_overlaps)
print("Typical overlap:", typical_overlap)
pl.hist(n_overlaps)

What about near duplicates between datasets? (images that are almost identical)

In [None]:
# For each sampled test image, count the training images whose total absolute
# pixel difference from it is below `threshold`.
n_overlaps = []
threshold = 1E-2 # define an arbitrary threshold -- play with this

# the data has been randomized, so let's just check the first 100 images and assume that
#    is a representative sample
for test_img in test_dataset[:100]:
    # Take the absolute value per pixel *before* summing; applying abs() to the
    # already-summed signed differences lets opposite-sign errors cancel and
    # flags clearly different images as near duplicates.
    # NOTE(review): assumes pixels are stored as a signed/float dtype; unsigned
    # data would wrap around on subtraction -- confirm.
    diff = train_dataset - test_img[None]
    np.abs(diff, out=diff)  # in place, avoids a second full-size temporary
    n_overlap = (diff.sum(axis=-1).sum(axis=-1) < threshold).sum()
    n_overlaps.append(n_overlap)

---
Problem 6
---------

Train a logistic regressor on the image data using 50, 100, 1000 and 5000 training samples. 

In [None]:
# scikit-learn logistic-regression classifier with default settings; it is
# fitted on a training subset and evaluated on the test set in the cells below.
model = LogisticRegression()

In [None]:
# Side length of an image in pixels; the reshapes below flatten each image to
# image_size*image_size features, i.e. they treat the images as square.
image_size = train_dataset.shape[-1]
subset = 50 # replace with 100, 1000, 5000

# Pick `subset` *distinct* training examples. np.random.choice samples with
# replacement by default, which can put the same image into the subset more
# than once and shrink the effective training-set size.
idx = np.random.choice(train_dataset.shape[0], size=subset, replace=False)
train_subset_data = train_dataset[idx].reshape(subset, image_size*image_size)
train_subset_labels = train_labels[idx]

In [None]:
# Fit the classifier on the flattened training subset selected above.
model.fit(train_subset_data, train_subset_labels)

In [None]:
# Flatten every test image to a 1-D feature vector (same layout as the
# training subset) and predict its class.
n_test = test_dataset.shape[0]
test_features = test_dataset.reshape(n_test, image_size*image_size)
predict_labels = model.predict(test_features)

In [None]:
# Test error rate: fraction of test images whose predicted label is wrong.
n_misclassified = (predict_labels != test_labels).sum()
n_misclassified / float(test_labels.size)