In [None]:
def get_images_from_ids(query_result):
    """
    Look up the training images named by a nearest-neighbors query result.

    :param:
     - `query_result`: object with a 'reference_label' column of image ids
    :return: the rows of `image_train` whose 'id' appears in that column
    """
    neighbor_ids = query_result['reference_label']
    return image_train.filter_by(neighbor_ids, 'id')

In [None]:
# Ask the model for the images closest to `cat`, then fetch those images.
cat_query_result = knn_model.query(cat)
cat_neighbors = get_images_from_ids(cat_query_result)

In [None]:
# Render the neighbor images so the matches can be judged by eye.
cat_neighbors['image'].show()

Very cool results showing similar cats.

## Finding similar images to a car

In [None]:
# Take a one-row slice (8:9) rather than indexing row 8, so `car` has the same
# form as the value handed to knn_model.query in the next cell.
car = image_train[8:9]
car['image'].show()

In [None]:
# Query, fetch the matching training images, and display them in one expression.
get_images_from_ids(knn_model.query(car))['image'].show()

## Just for fun, let's create a lambda to find and show nearest neighbor images

In [None]:
# show_neighbors(i): display the nearest-neighbor images of training row `i`.
# The row is taken as the one-row slice i:i+1 before being passed to the model.
show_neighbors = lambda i: (
    get_images_from_ids(knn_model.query(image_train[i:i + 1]))['image'].show()
)

In [None]:
# Row 8 is the car image selected earlier with image_train[8:9].
show_neighbors(8)

In [None]:
# Try a different training row (26).
show_neighbors(26)

# Quiz

## Question 1
What is the least common category in the training data?

In [None]:
# Count the rows carrying each label, then keep the smallest count.
# min() returns the first minimal pair it meets, so a tie resolves to the label
# seen first -- the same outcome as a strict `<` comparison in a manual loop.
# Using min() also avoids a sentinel start value and a conditionally-bound
# `least_label`.
label_counts = [(label, image_train[image_train['label'] == label].num_rows())
                for label in image_train['label'].unique()]
least_label, least = min(label_counts, key=lambda pair: pair[1])
print(least_label, least)

In [None]:
# Cross-check the loop above against the library's own summary of the label
# column (presumably it lists per-label frequencies -- confirm in the output).
print(image_train['label'].sketch_summary())

## Question 2

Create models for each of the categories shown in the summary.

### Separate the frame by category into separate frames

In [None]:
# Build one frame per category, keyed by label, by filtering on the 'label' column.
training = {label: image_train[image_train['label'] == label]
            for label in ('automobile', 'cat', 'dog', 'bird')}

# Convenience names for the individual category frames.
automobile_train = training['automobile']
cat_train = training['cat']
dog_train = training['dog']
bird_train = training['bird']

In [None]:
# Row counts expected for each category frame.
assert automobile_train.num_rows() == 509
assert cat_train.num_rows() == 509
assert dog_train.num_rows() == 509
assert bird_train.num_rows() == 478
for label in training:
    # `unique() == label` is an element-wise comparison, so asserting on it
    # directly is not a reliable check of the contents (verify against the
    # SArray docs).  Compare the list of distinct values instead: each frame
    # must contain exactly one distinct label, and it must be its own key.
    assert list(training[label]['label'].unique()) == [label]

### Create a nearest-neighbors model for each category

In [None]:
# Single-category model built by hand first; the loop in the next cell builds
# the same kind of model for every entry in `training`.
automobile_model = graphlab.nearest_neighbors.create(
    automobile_train,
    features=['deep_features'],
    label='id',
)

In [None]:
# One nearest-neighbors model per category, keyed by the category label.
models = {label: graphlab.nearest_neighbors.create(training_data,
                                                   features=['deep_features'],
                                                   label='id')
          for label, training_data in training.items()}


### What is the nearest 'cat' labeled image in the training data to the first cat image in the `image_test` data?

In [None]:
image_test = graphlab.SFrame('image_test_data/')
# One-row slice so the result can be passed straight to a model's query().
first_cat = image_test[0:1]
# The cells below treat this row as a cat image; fail early if it is not.
assert first_cat['label'][0] == 'cat'

In [None]:
# Display the query image for reference.
first_cat['image'].show()

In [None]:
# Query the cat-only model, then keep just the rank-1 row of the result.
cat_neighbors = models['cat'].query(first_cat)
cat_rank_one = cat_neighbors['rank'] == 1
nearest_cat = cat_neighbors[cat_rank_one]
print(nearest_cat)

In [None]:
# Fetch and display the training image behind the rank-1 cat match.
images = get_images_from_ids(nearest_cat)
images['image'].show()

The nearest cat has reference label 16289.

### What is the nearest 'dog' labeled image in the training set to the same cat image?

In [None]:
# Query the dog-only model with the same cat image and keep the rank-1 row.
dog_neighbors = models['dog'].query(first_cat)
dog_rank_one = dog_neighbors['rank'] == 1
nearest_dog = dog_neighbors[dog_rank_one]
print(nearest_dog)

In [None]:
# Fetch and display the training image behind the rank-1 dog match.
images = get_images_from_ids(nearest_dog)
images['image'].show()

The nearest dog has reference label 16976.

## Question 3
### Find the mean distance for the first 5 nearest neighbors in the cat model to the cat image used above.

In [None]:
# Mean distance over the first five rows of the cat query result
# (assumes the rows are ordered nearest-first -- confirm against the 'rank' column).
cat_neighbors[:5]['distance'].mean()

### Now find the mean distance for the five nearest dog neighbors

In [None]:
# Mean distance over the first five rows of the dog query result
# (assumes the rows are ordered nearest-first -- confirm against the 'rank' column).
dog_neighbors[:5]['distance'].mean()

On average the cat-neighbors are slightly closer to the first cat image than the dog-neighbors.

## Question 4 (Challenge Question)

### Split the image-test data into category-SFrames like you did with the training data

In [None]:
# One test frame per category, using the same labels as the training split.
testing = {label: image_test[image_test['label'] == label] for label in training}
for label in testing:
    # `unique() == label` is an element-wise comparison, so asserting on it
    # directly is not a reliable check of the contents (verify against the
    # SArray docs).  Compare the list of distinct values instead.
    assert list(testing[label]['label'].unique()) == [label]

For each image in each of the testing sets, find its nearest neighbor in each of the category models.

In [None]:
# neighbors[train_label][test_label] is the k=1 query result obtained by running
# the test_label test set through the train_label model.
neighbors = {}
for train_label in training:
    neighbors[train_label] = {test_label: models[train_label].query(testing[test_label], k=1)
                              for test_label in testing}

This finds 1 nearest-neighbor for each of the images in each of the test-sets.

### How many of the dog images in the test set are closer to a dog in the training set than to another category of image?

#### Create an SFrame with the distances from the 'dog' test examples to the nearest neighbors in each of the training sets.

In [None]:
# Peek at the result of querying the cat model with the dog test images.
neighbors['cat']['dog'].head(3)

In [None]:
def get_distances(target='dog'):
    """
    Gather the nearest-neighbor distances for one test category.

    :param:
     - `target`: the name of the label in the test-data that you want to check
    :return: SFrame with one '<target>-<label>' column per label in `neighbors`,
             each holding the 'distance' column of neighbors[label][target]
    """
    columns = {}
    for label in neighbors:
        column_name = "{0}-{1}".format(target, label)
        columns[column_name] = neighbors[label][target]['distance']
    return graphlab.SFrame(columns)

In [None]:
# Distances for the dog test images (get_distances defaults to 'dog'),
# followed by the mean of each distance column.
dog_distances = get_distances()
for column in dog_distances.column_names():
    column_mean = dog_distances[column].mean()
    print(column, column_mean)

### Compute the number of correct predictions ('dog' label, k=1)

In [None]:
# Distance columns for the non-dog models.
not_dog_dog = ('dog-automobile', 'dog-cat', 'dog-bird')


def is_dog_correct(row):
    """
    :param:
     - `row`: one row in the sframe of distances
    :return: True if the dog-dog distance is strictly lower than the distance
             in every other column, otherwise False (a tie counts as False)
    """
    dog_distance = row['dog-dog']
    for other_column in not_dog_dog:
        if not dog_distance < row[other_column]:
            return False
    return True

# Sanity-check is_dog_correct against the raw columns at the first failing row.
index = next((position for position, row in enumerate(dog_distances)
              if not is_dog_correct(row)), None)
assert index is not None, 'every dog row passed, so there is no failing row to inspect'
# is_dog_correct uses a strict <, so a failing row has dog-dog >= at least one
# other distance (ties included) -- hence >= rather than > here.
assert any(dog_distances['dog-dog'][index] >= dog_distances[label][index] for label in not_dog_dog)
# The row just before the first failure must pass.  Skip this when the first
# failure is row 0: index - 1 would be -1 and no longer mean "the previous row".
if index > 0:
    assert all(dog_distances['dog-dog'][index - 1] < dog_distances[label][index - 1] for label in not_dog_dog)

#### Calculate the number of rows that are correct
(as a check, it is given that using 'cat' instead of dog would give 548 correct classifications)

##### Cat-check

In [None]:
cat_distances = get_distances('cat')
# Names of the distance columns for every non-cat model.
others = tuple('cat-{0}'.format(label) for label in training if label != 'cat')


def is_cat_correct(row):
    """
    :param:
     - `row`: one row in the sframe of cat distances
    :return: True if the cat-cat distance is strictly lower than the distance
             in every other column, otherwise False
    """
    return all((row['cat-cat'] < row[label] for label in others))


# Check value supplied with the exercise for the 'cat' case.
expected = 548
cat_correct = cat_distances.apply(is_cat_correct).sum()
assert expected == cat_correct, \
    'expected {0} correct cat classifications, got {1}'.format(expected, cat_correct)

In [None]:
# Fraction of the cat test images whose nearest neighbor was a cat.
cat_accuracy = float(cat_correct) / testing['cat'].num_rows()
print('the model had an accuracy of {0:.2f} for cat-classification.'.format(cat_accuracy))

##### And now the dogs

In [None]:
# Apply the per-row check to every dog test row and sum the results to get the
# number of rows that passed (same approach as the cat check above).
correct = dog_distances.apply(is_dog_correct).sum()
print('There were {0} correct dog classifications'.format(correct))

### What is the accuracy for the dog classifications?