In [1]:
import matplotlib.pyplot as plt
from workspace_utils import active_session
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from models import Net

from torch.utils.data import DataLoader , Dataset
from torchvision import transforms , utils
%load_ext autoreload
%autoreload 2

In [2]:
## Define the Net in models.py
## Once you've define the network, you can instantiate it
# one example conv layer has been provided for you
net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 32, kernel_size=(5, 5), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
  (conv4): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
  (conv5): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1))
  (fc1): Linear(in_features=18432, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=136, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)


## Transform the dataset 

To prepare for training, we have created a transformed dataset of images and keypoints.

### Define a data transform

In PyTorch, a convolutional neural network expects a torch image of a consistent size as input. For efficient training, and so our model's loss does not blow up during training, it is also suggested that we normalize the input images and keypoints. The necessary transforms have been defined in `data_load.py` and we **do not** need to modify these.

To define the data transform below, we have used a [composition](http://pytorch.org/tutorials/beginner/data_loading_tutorial.html#compose-transforms) of:
1. Rescaling and/or cropping the data, such that we are left with a square image (the suggested size is 224x224px)
2. Normalizing the images and keypoints; turning each RGB image into a grayscale image with a color range of [0, 1] and transforming the given keypoints into a range of [-1, 1]
3. Turning these images and keypoints into Tensors

**This transform will be applied to the training data and, later, the test data**. It will change how we go about displaying these images and keypoints, but these steps are essential for efficient training.


In [4]:
from Load import FacialKeypointsDataset
from Load import Rescale , RandomCrop , Normalize , ToTensor
data_transform = transforms.Compose([Rescale(250),
                                    RandomCrop(224),
                                    Normalize(),
                                    ToTensor()])

# testing that you've defined a transform
assert(data_transform is not None), 'Define a data_transform'

In [10]:
# create the transformed dataset
transformed_dataset = FacialKeypointsDataset(csv_file='data/training_frames_keypoints.csv',
                                             root_dir='data/training/',
                                             transform=data_transform)


print('Number of images: ', len(transformed_dataset))

# iterate through the transformed dataset and print some stats about the first few samples
for i in range(4):
    sample = transformed_dataset[i]
    print(i, sample['image'].size(), sample['keypoints'].size())

Number of images:  3462
0 torch.Size([1, 24, 115]) torch.Size([68, 2])
1 torch.Size([1, 77, 146]) torch.Size([68, 2])
2 torch.Size([1, 106, 71]) torch.Size([68, 2])
3 torch.Size([1, 149, 125]) torch.Size([68, 2])


## Batching and loading data

Next, having defined the transformed dataset, we can use PyTorch's DataLoader class to load the training data in batches of whatever size as well as to shuffle the data for training the model. You can read more about the parameters of the DataLoader in [this documentation](http://pytorch.org/docs/master/data.html).

#### Batch size
Decide on a good batch size for training your model. Try both small and large batch sizes and note how the loss decreases as the model trains. Too large a batch size may cause your model to crash and/or run out of memory while training.

**Note for Windows users**: Please change the `num_workers` to 0 or you may face some issues with your DataLoader failing.

In [16]:
# load training data in batches
batch_size = 1

train_loader = DataLoader(transformed_dataset, 
                          batch_size=batch_size,
                          shuffle=True, 
                          num_workers=0)
for item in train_loader:
    print(item)

{'image': tensor([[[[1.6657e-04, 8.1730e-05, 5.1762e-05,  ..., 1.6602e-04,
           1.7463e-04, 1.7472e-04],
          [1.7204e-04, 9.9935e-05, 8.3434e-05,  ..., 1.5433e-04,
           1.6679e-04, 1.6858e-04],
          [1.7249e-04, 7.1911e-05, 4.7295e-05,  ..., 1.3649e-04,
           1.3649e-04, 1.3649e-04],
          ...,
          [2.8859e-04, 2.8910e-04, 2.8909e-04,  ..., 1.7569e-03,
           1.8124e-03, 1.8473e-03],
          [2.7890e-04, 2.8394e-04, 2.8227e-04,  ..., 1.4578e-03,
           1.6551e-03, 1.8323e-03],
          [2.6475e-04, 2.6475e-04, 2.7636e-04,  ..., 1.2775e-03,
           1.3026e-03, 1.4614e-03]]]]), 'keypoints': tensor([[[-3.4061, -2.6481],
         [-3.3763, -2.5128],
         [-3.3763, -2.3926],
         [-3.3465, -2.2574],
         [-3.2869, -2.1072],
         [-3.1827, -1.9719],
         [-3.0635, -1.8818],
         [-2.9294, -1.7766],
         [-2.8102, -1.7165],
         [-2.6761, -1.7466],
         [-2.6463, -1.7766],
         [-2.6463, -1.8367],
    

ValueError: high <= 0

## Before training

Take a look at how this model performs before it trains. You should see that the keypoints it predicts start off in one spot and don't match the keypoints on a face at all! It's interesting to visualize this behavior so that you can compare it to the model after training and see how the model has improved.

#### Load in the test dataset

The test dataset is one that this model has *not* seen before, meaning it has not trained with these images. We'll load in this test data and before and after training, see how our model performs on this set!

To visualize this test data, we have to go through some un-transformation steps to turn our images into python images from tensors and to turn our keypoints back into a recognizable range. 

In [22]:
# load in the test data, using the dataset class
# AND apply the data_transform you defined above

# create the test dataset
test_dataset = FacialKeypointsDataset(csv_file='data/test_frames_keypoints.csv',
                                             root_dir='data/test/',
                                             transform=data_transform)



In [23]:
# load test data in batches
batch_size = 1

test_loader = DataLoader(test_dataset, 
                          batch_size=batch_size,
                          shuffle=True, 
                          num_workers=4)

## Apply the model on a test sample

To test the model on a test sample of data, we have to follow these steps:
1. Extract the image and ground truth keypoints from a sample
2. Wrap the image in a Variable, so that the net can process it as input and track how it changes as the image moves through the network.
3. Make sure the image is a FloatTensor, which the model expects.
4. Forward pass the image through the net to get the predicted, output keypoints.

This function test how the network performs on the first batch of test data. It returns the images, the transformed images, the predicted keypoints (produced by the model), and the ground truth keypoints.

In [24]:
# test the model on a batch of test images

def net_sample_output():
    
    # iterate through the test dataset
    for i, sample in enumerate(test_loader):
        
        # get sample data: images and ground truth keypoints
        images = sample['image']
        key_pts = sample['keypoints']

        # convert images to FloatTensors
        images = images.type(torch.FloatTensor)

        # forward pass to get net output
        output_pts = net(images)
        
        # reshape to batch_size x 68 x 2 pts
        output_pts = output_pts.view(output_pts.size()[0], 68, -1)
        
        # break after first image is tested
        if i == 0:
            return images, output_pts, key_pts
            

#### Debugging tips

If you get a size or dimension error here, make sure that your network outputs the expected number of keypoints! Or if you get a Tensor type error, look into changing the above code that casts the data into float types: `images = images.type(torch.FloatTensor)`.

In [25]:
# call the above function
# returns: test images, test predicted keypoints, test ground truth keypoints
test_images, test_outputs, gt_pts = net_sample_output()

# print out the dimensions of the data to see if they make sense
print(test_images.data.size())
print(test_outputs.data.size())
print(gt_pts.size())

[W NNPACK.cpp:79] Could not initialize NNPACK! Reason: Unsupported hardware.


RuntimeError: Given input size: (128x12x1). Calculated output size: (128x6x0). Output size is too small

## Visualize the predicted keypoints

Once we've had the model produce some predicted output keypoints, we can visualize these points in a way that's similar to how we've displayed this data before, only this time, we have to "un-transform" the image/keypoint data to display it.

The *new* function, `show_all_keypoints` displays a grayscale image, its predicted keypoints and its ground truth keypoints (if provided).

In [None]:
def show_all_keypoints(image, predicted_key_pts, gt_pts=None):
    """Show image with predicted keypoints"""
    
    # image is grayscale
    plt.imshow(image, cmap='gray')
    plt.scatter(predicted_key_pts[:, 0], predicted_key_pts[:, 1], s=20, marker='.', c='m')
    # plot ground truth points as green pts
    if gt_pts is not None:
        plt.scatter(gt_pts[:, 0], gt_pts[:, 1], s=20, marker='.', c='g')


#### Un-transformation

Next, you'll see a helper function. `visualize_output` that takes in a batch of images, predicted keypoints, and ground truth keypoints and displays a set of those images and their true/predicted keypoints.

This function's main role is to take batches of image and keypoint data (the input and output of your CNN), and transform them into numpy images and un-normalized keypoints (x, y) for normal display. The un-transformation process turns keypoints and images into numpy arrays from Tensors *and* it undoes the keypoint normalization done in the Normalize() transform; it's assumed that you applied these transformations when you loaded your test data.

In [None]:
# visualize the output
# by default this shows a batch of 10 images
def visualize_output(test_images, test_outputs, gt_pts=None, batch_size=10):

    for i in range(batch_size):
        plt.figure(figsize=(20,10))
        ax = plt.subplot(1, batch_size, i+1)

        # un-transform the image data
        image = test_images[i].data   # get the image from it's Variable wrapper
        image = image.numpy()   # convert to numpy array from a Tensor
        image = np.transpose(image, (1, 2, 0))   # transpose to go from torch to numpy image

        # un-transform the predicted key_pts data
        predicted_key_pts = test_outputs[i].data
        predicted_key_pts = predicted_key_pts.numpy()
        # undo normalization of keypoints  
        predicted_key_pts = predicted_key_pts*50.0+100
        
        # plot ground truth points for comparison, if they exist
        ground_truth_pts = None
        if gt_pts is not None:
            ground_truth_pts = gt_pts[i]         
            ground_truth_pts = ground_truth_pts*50.0+100
        
        # call show_all_keypoints
        show_all_keypoints(np.squeeze(image), predicted_key_pts, ground_truth_pts)
            
        plt.axis('off')

    plt.show()
    
# call it
visualize_output(test_images, test_outputs, gt_pts)

## Training

#### Loss function
Training a network to predict keypoints is different than training a network to predict a class; instead of outputting a distribution of classes and using cross entropy loss, we have to choose a loss function that is suited for regression, which directly compares a predicted value and target value. Read about the various kinds of loss functions (like MSE or L1/SmoothL1 loss) in [this documentation](http://pytorch.org/docs/master/_modules/torch/nn/modules/loss.html).

### Define the loss and optimization

Next, we will define how the model will train by deciding on the loss function and optimizer.

---

In [None]:
## Define the loss and optimization
import torch.optim as optim

criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr = 0.001)


## Training and Initial Observation

Now, we will train on our batched training data from `train_loader` for a number of epochs. 

In [9]:
def train_net(n_epochs):

    # prepare the net for training
    net.train()
    training_loss = []

    for epoch in range(n_epochs):  # loop over the dataset multiple times
        
        running_loss = 0.0

        # train on batches of data, assumes you already have train_loader
        for batch_i, data in enumerate(train_loader):
            # get the input images and their corresponding labels
            images = data['image']
            key_pts = data['keypoints']

            # flatten pts
            key_pts = key_pts.view(key_pts.size(0), -1)

            # convert variables to floats for regression loss
            key_pts = key_pts.type(torch.FloatTensor)
            images = images.type(torch.FloatTensor)

            # forward pass to get outputs
            output_pts = net(images)

            # calculate the loss between predicted and target keypoints
            loss = criterion(output_pts, key_pts)

            # zero the parameter (weight) gradients
            optimizer.zero_grad()
            
            # backward pass to calculate the weight gradients
            loss.backward()

            # update the weights
            optimizer.step()

            # print loss statistics
            running_loss += loss.item()
            if batch_i % 10 == 9:    # print every 10 batches
                print('Epoch: {}, Batch: {}, Avg. Loss: {}'.format(epoch + 1, batch_i+1, running_loss/10))
                running_loss = 0.0
        training_loss.append(running_loss)

    print('Finished Training')
    return training_loss


In [10]:
# train your network
# n_epochs = 10 # start small, and increase when you've decided on your model structure and hyperparams

# this is a Workspaces-specific context manager to keep the connection
# alive while training your model, not part of pytorch
training_loss = train_net(10)


Epoch: 1, Batch: 10, Avg. Loss: 0.3359374165534973
Epoch: 1, Batch: 20, Avg. Loss: 0.2519407905638218
Epoch: 1, Batch: 30, Avg. Loss: 0.19335664361715316
Epoch: 1, Batch: 40, Avg. Loss: 0.22232168912887573
Epoch: 1, Batch: 50, Avg. Loss: 0.20937469676136972
Epoch: 1, Batch: 60, Avg. Loss: 0.17770689800381662
Epoch: 1, Batch: 70, Avg. Loss: 0.22189381569623948
Epoch: 1, Batch: 80, Avg. Loss: 0.22324044555425643
Epoch: 1, Batch: 90, Avg. Loss: 0.2096339724957943
Epoch: 1, Batch: 100, Avg. Loss: 0.2514382854104042
Epoch: 1, Batch: 110, Avg. Loss: 0.19009072184562684
Epoch: 1, Batch: 120, Avg. Loss: 0.17226141765713693
Epoch: 1, Batch: 130, Avg. Loss: 0.1690574675798416
Epoch: 1, Batch: 140, Avg. Loss: 0.210720930993557
Epoch: 1, Batch: 150, Avg. Loss: 0.14154042899608613
Epoch: 1, Batch: 160, Avg. Loss: 0.24603203684091568
Epoch: 1, Batch: 170, Avg. Loss: 0.21118866205215453
Epoch: 1, Batch: 180, Avg. Loss: 0.21770252510905266
Epoch: 1, Batch: 190, Avg. Loss: 0.20987769737839698
Epoch: 1,

Epoch: 5, Batch: 220, Avg. Loss: 0.2171165056526661
Epoch: 5, Batch: 230, Avg. Loss: 0.1657769113779068
Epoch: 5, Batch: 240, Avg. Loss: 0.19373459070920945
Epoch: 5, Batch: 250, Avg. Loss: 0.18331035524606704
Epoch: 5, Batch: 260, Avg. Loss: 0.20340466722846032
Epoch: 5, Batch: 270, Avg. Loss: 0.20506639033555984
Epoch: 5, Batch: 280, Avg. Loss: 0.19067778289318085
Epoch: 5, Batch: 290, Avg. Loss: 0.17561472356319427
Epoch: 5, Batch: 300, Avg. Loss: 0.3288019984960556
Epoch: 5, Batch: 310, Avg. Loss: 0.17356661334633827
Epoch: 5, Batch: 320, Avg. Loss: 0.17975733652710915
Epoch: 5, Batch: 330, Avg. Loss: 0.17582921609282492
Epoch: 5, Batch: 340, Avg. Loss: 0.20502685159444808
Epoch: 6, Batch: 10, Avg. Loss: 0.2084253467619419
Epoch: 6, Batch: 20, Avg. Loss: 0.22085311263799667
Epoch: 6, Batch: 30, Avg. Loss: 0.18563256338238715
Epoch: 6, Batch: 40, Avg. Loss: 0.16427772715687752
Epoch: 6, Batch: 50, Avg. Loss: 0.19524946212768554
Epoch: 6, Batch: 60, Avg. Loss: 0.1444060578942299
Epoc

Epoch: 10, Batch: 80, Avg. Loss: 0.21363032162189483
Epoch: 10, Batch: 90, Avg. Loss: 0.19739335998892785
Epoch: 10, Batch: 100, Avg. Loss: 0.22429664358496665
Epoch: 10, Batch: 110, Avg. Loss: 0.15968615859746932
Epoch: 10, Batch: 120, Avg. Loss: 0.17639247700572014
Epoch: 10, Batch: 130, Avg. Loss: 0.180033440887928
Epoch: 10, Batch: 140, Avg. Loss: 0.15491386577486993
Epoch: 10, Batch: 150, Avg. Loss: 0.17513110041618346
Epoch: 10, Batch: 160, Avg. Loss: 0.19761365056037902
Epoch: 10, Batch: 170, Avg. Loss: 0.15280862003564835
Epoch: 10, Batch: 180, Avg. Loss: 0.2037037879228592
Epoch: 10, Batch: 190, Avg. Loss: 0.2092262625694275
Epoch: 10, Batch: 200, Avg. Loss: 0.2349842943251133
Epoch: 10, Batch: 210, Avg. Loss: 0.16918335258960723
Epoch: 10, Batch: 220, Avg. Loss: 0.1955057203769684
Epoch: 10, Batch: 230, Avg. Loss: 0.22907571494579315
Epoch: 10, Batch: 240, Avg. Loss: 0.2003783881664276
Epoch: 10, Batch: 250, Avg. Loss: 0.23790448978543283
Epoch: 10, Batch: 260, Avg. Loss: 0.1

In [None]:
# visualize the loss as the network trained
plt.figure()
plt.semilogy(training_loss)
plt.grid()
plt.xlabel('Epoch')
plt.ylabel('Loss');

## Test data

See how the model performs on previously unseen, test data. We've already loaded and transformed this data, similar to the training data. Next, run the trained model on these images to see what kind of keypoints are produced.

In [None]:
# get a sample of test data again
test_images, test_outputs, gt_pts = net_sample_output()

print(test_images.data.size())
print(test_outputs.data.size())
print(gt_pts.size())

In [None]:
## visualize  test output
# you can use the same function as before, by un-commenting the line below:

visualize_output(test_images, test_outputs, gt_pts)


Once we have found a good model (or two), we have to save the model so we can load it and use it later!

In [None]:
## change the name to something uniqe for each new model
model_dir = 'saved_models/'
model_name = 'facial_keypoints_model.pt'

# after training, save your model parameters in the dir 'saved_models'
torch.save(net.state_dict(), model_dir+model_name)

## Feature Visualization

Sometimes, neural networks are thought of as a black box, given some input, they learn to produce some output. CNN's are actually learning to recognize a variety of spatial patterns and you can visualize what each convolutional layer has been trained to recognize by looking at the weights that make up each convolutional kernel and applying those one at a time to a sample image. This technique is called feature visualization and it's useful for understanding the inner workings of a CNN.

In the cell below, you can see how to extract a single filter (by index) from your first convolutional layer. The filter should appear as a grayscale grid.

In [None]:
# Get the weights in the first conv layer, "conv1"
# if necessary, change this to reflect the name of your first conv layer
weights1 = net.conv1.weight.data

w = weights1.numpy()

filter_index = 0

print(w[filter_index][0])
print(w[filter_index][0].shape)

# display the filter weights
plt.imshow(w[filter_index][0], cmap='gray')


## Feature maps

Each CNN has at least one convolutional layer that is composed of stacked filters (also known as convolutional kernels). As a CNN trains, it learns what weights to include in it's convolutional kernels and when these kernels are applied to some input image, they produce a set of **feature maps**. So, feature maps are just sets of filtered images; they are the images produced by applying a convolutional kernel to an input image. These maps show us the features that the different layers of the neural network learn to extract. For example, you might imagine a convolutional kernel that detects the vertical edges of a face or another one that detects the corners of eyes. You can see what kind of features each of these kernels detects by applying them to an image. One such example is shown below; from the way it brings out the lines in an the image, you might characterize this as an edge detection filter.

<img src='images/feature_map_ex.png' width=50% height=50%/>


Next, choose a test image and filter it with one of the convolutional kernels in your trained CNN; look at the filtered output to get an idea what that particular kernel detects.

### Filter an image to see the effect of a convolutional kernel
---

In [None]:
## load in and display any image from the transformed test dataset
import cv2

image = cv2.imread('images/mona_lisa.jpg')
# convert image to grayscale
image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) / 255.0

In [None]:
## Using cv's filter2D function
filter_kernel = np.array([[ 0,  1,  1],
                          [-1,  0,  1],
                          [-1, -1,  0]])

filtered_image = cv2.filter2D(image, -1, filter_kernel)

f, (ax1, ax2, ax3) = plt.subplots(ncols=3, nrows=1, figsize=(10, 5))
ax1.imshow(filter_kernel, cmap='gray')
ax2.imshow(image, cmap='gray')
ax3.imshow(filtered_image, cmap='gray')

ax1.set_title('Kernel')
ax2.set_title('Orginal Image')
ax3.set_title('Filtered image')
plt.tight_layout();

In [None]:
## apply a specific set of filter weights (like the one displayed above) to the test image
weights = net.conv1.weight.data.numpy()

filter_kernel = weights[filter_index][0]
filtered_image = cv2.filter2D(image, -1, filter_kernel)

f, (ax1, ax2, ax3) = plt.subplots(ncols=3, nrows=1, figsize=(10, 5))
ax1.imshow(filter_kernel, cmap='gray')
ax2.imshow(image, cmap='gray')
ax3.imshow(filtered_image, cmap='gray')

ax1.set_title('Kernel')
ax2.set_title('Orginal Image')
ax3.set_title('Filtered image')
plt.tight_layout();

---
## Moving on!

Now that we have defined and trained the model (and saved the best model), we are ready to move on to the last notebook, which combines a face detector with your saved model to create a facial keypoint detection system that can predict the keypoints on *any* face in an image!