In [1]:
import torch
from torch import Tensor
import dlc_practical_prologue as prologue
import numpy as np

### 1. Nearest Neighbor
Write a function that gets a training set and a test sample and returns the label of the training point
the closest to the latter.

In [2]:
def nearest_classification(train_input, train_target, x):
    diffs = train_input - x
    sqs  = torch.pow(diffs, 2)
    norms = torch.mean(sqs, 1)
    _, min_ind = norms.min(0)
    return train_target[min_ind]

### 2. Error estimation

In [3]:
def compute_nb_errors(train_input, train_target, test_input, test_target, mean = None, proj = None):
    if mean is not None:
        train_input -= mean
        test_input -= mean
    if proj is not None:
        train_input = train_input @ proj.t()
        test_input = test_input @ proj.t()
    # Count of misclassified points
    loss = 0
    for x, y in zip(test_input, test_target):
        loss += (nearest_classification(train_input, train_target, x) != y)
    return loss.item()

### 3. PCA

In [4]:
def PCA(x):
    mean = x.mean(0)
    b = x - mean
    sigma = b.t() @ b
    eigenvalues, eigenvectors = sigma.eig(True)
    ordered_eigs = eigenvalues[:, 0].abs().sort(0, True)[1]
    eigenvectors = eigenvectors.t()[ordered_eigs]
    return mean, eigenvectors

### 4. Evaluation

In [5]:
for c in [ False, True ]:
    train_input, train_target, test_input, test_target = prologue.load_data(cifar=c)

    nb_errors = compute_nb_errors(train_input, train_target, test_input, test_target)
    print('Baseline nb_errors {:d} error {:.02f}%'.format(nb_errors, 100 * nb_errors / test_input.size(0)))

    basis = train_input.new(100, train_input.size(1)).normal_()

    nb_errors = compute_nb_errors(train_input, train_target, test_input, test_target, None, basis)
    print('Random {:d}d nb_errors {:d} error {:.02f}%'.format(basis.size(0), nb_errors, 100 * nb_errors / test_input.size(0)))

    mean, basis = PCA(train_input)

    for d in [ 100, 50, 10, 3 ]:
        nb_errors = compute_nb_errors(train_input, train_target, test_input, test_target, mean, basis[:d])
        print('PCA {:d}d nb_errors {:d} error {:.02f}%'.format(d, nb_errors, 100 * nb_errors / test_input.size(0)))

* Using MNIST
** Reduce the data-set (use --full for the full thing)
** Use 1000 train and 1000 test samples
Baseline nb_errors 172 error 17.20%
Random 100d nb_errors 212 error 21.20%
PCA 100d nb_errors 164 error 16.40%
PCA 50d nb_errors 155 error 15.50%
PCA 10d nb_errors 214 error 21.40%
PCA 3d nb_errors 85 error 8.50%
* Using CIFAR
Files already downloaded and verified
Files already downloaded and verified
** Reduce the data-set (use --full for the full thing)
** Use 1000 train and 1000 test samples
Baseline nb_errors 234 error 23.40%
Random 100d nb_errors 11 error 1.10%
PCA 100d nb_errors 233 error 23.30%
PCA 50d nb_errors 225 error 22.50%
PCA 10d nb_errors 251 error 25.10%
PCA 3d nb_errors 71 error 7.10%
