In [None]:

# Notes from week 2 of https://www.coursera.org/learn/convolutional-neural-networks/home/week/2




In [None]:
# says if a certain NN architecture works well on one task, it's likely to do well on other tasks
# so good to be familiar with some effective existing architectures

# LeNet-5: conv/pool2/conv/pool2/FC/FC/output_with_softmax (60k params overall)
# (the original LeNet-5 used average pooling; a modern version would probably use max pooling)

# c = channel count, s = stride, w = width of conv matrix in pixels
# AlexNet: 227px wide/conv(w=11,s=4,c=96)/maxpool(w=3,s=2)/conv(w=5,s=1,c=256)/maxpool(w=3,s=2)/
#          conv(w=3,s=1,c=384)/conv(w=3,s=1,c=384)/conv(w=3,s=1,c=256)/ [3 'same'-padded conv layers in a row here]
#          maxpool(w=3,s=2)/unravel to 9216 features (6*6*256)/
#          FC(9216->4096) / FC(4096->4096) / FC(4096->1000)output_with_softmax
# 60million params in total
# Takes 227*227*3 images
# originally trained on 1.2m images of 1000 classes
# pre-trained AlexNet is available as a PyTorch file: https://www.kaggle.com/pytorch/alexnet

# VGG-16: deeper network, using padding so dimensions are same after each conv layer 
#    dims get smaller with each pool tho
#    all conv layers are 3*3, stride = 1, channel count denoted by square brackets below
#    all pool layers are 2*2, stride = 2
#    where identical layers are repeated, denoted as x2/x3/etc below
# 224/3 (224*224*3) start/
# conv[64]x2/pool   (112/64)
# conv[128]x2/pool  (56/128)
# conv[256]x3/pool/  (28/256)
# conv[512]x3/pool/  (14/512)
# conv[512]x3/pool   (7/512)
# FC(7*7*512 in, 4096 out)
# FC (4096 in, 4096 out)
# FC with softmax output (4096 in, 1000 out)
### 138million weights

# VGG-19: bigger version of VGG-16 (19 weight layers instead of 16); VGG-19 performs only slightly better,
# so VGG-16 is often used instead





In [1]:
# ResNets are made of "residual blocks"

# a residual block is two FC layers with relu activation, but the data that feeds into the first layer
# is preserved, and added to the input of the 2nd relu activation, looks like:

# resnet method can be applied to any layer type (eg: convolutions)
# not sure if activation is always relu. 
# structure:
# input -> layer(W1) -> relu -> layer(W2) -> add input to this -> relu 

# stack lots of residual blocks to make network
# says using residual blocks lets you make much deeper networks

# with normal NNs the loss on the training set can increase as you add layers past a certain point (if too deep)
# resnets only get better with more depth (on the training set at least)
# how? perhaps to do with taking untransformed data forward, so giving NN a better 'memory' of old layers
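# res blocks give NN the option of effectively making no change in a layer (eg: if weights are zero
# then it just applies relu to a dataset that's already been relu'd, so the block's output equals its
# input - ie, the identity function is easy for a block to learn, so an extra block shouldn't hurt)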

# dimensions tend not to change within the residual block

# for good diagram on resnets:
# https://www.coursera.org/learn/convolutional-neural-networks/lecture/HAhz9/resnets

# why resnets work:
# https://www.coursera.org/learn/convolutional-neural-networks/lecture/XAKNO/why-resnets-work




In [3]:
# 1x1 convolution: if a 2d image just multiplies each value by the 1x1's value

# if image has >1 channels, each 1x1 filter has shape 1x1xchannel_count: at each pixel, multiply the
# channel values by the filter's weights, then sum and put through relu

# eg: so if have 6x6x10 image and 3 filters (each 1x1x10): for each 3rd-dim line (len=10) in the 6x6 grid,
# multiply those 10 values by the filter's 10 weights and sum them, then do relu on the result,
# giving 6x6 grid.
# then do this 3 times (once for each filter). Each iter will give a 6x6 output, so 3 filters
# give a 6x6x3 output

# so a good way to reduce 5x5x200 image to 5x5x40 without losing much information might be
# to put it through a "1x1 conv" layer with 40 filters (each 1x1x200)

# sometimes called 'network in network'




In [4]:
# inception network

# does multiple conv operations on the same input, ensuring they all output the same 2d dimensions, and
# stacking the operation outputs channel-wise. the operations include conv layers with different sized
# conv matrices and pooling

# a 5x5 conv applied directly to an input with many channels is expensive. doing a 1x1 conv first (it reduces
# the channel count) and then the 5x5 is about one tenth the computational cost, all else being equal
# adding a 1x1 conv layer prior to a 5x5 to reduce compute is called adding a 'bottleneck' layer
# says doing this tends not to hurt model accuracy (and compute cost remains at about 1/10 the cost of
# just the 5x5 layer, even though you're processing 2 layers instead of 1)


# inception module might look like: input from previous layer goes through all of these in parallel:
# 1x1 conv -> 3x3 conv
# 1x1 conv -> 5x5 conv
# 1x1 conv
# maxpool, s=1, output dim = same -> 1x1 conv (to reduce number of channels otherwise pooling layer channels
                # could dominate)
## stack all 4 outputs channel-wise



# inception network is made up of blocks of inception modules (sandwiched by a few beginning and
# end-stage processing layers)
# also can have sidebranches, which take a layer's output and put it through FC and softmax to make
# a prediction (can see how prediction accuracy changes at different layers of the network, and
# check weights of intermediate layers aren't hugely twisting the inputs - ie, the results should
# be roughly on track if not performing as well as the final output layer)


# other versions of inception have now been made (inception2, 3, etc) and in combination with resNets




In [5]:
## MobileNet: a network built from Depthwise Separable Convolutions

# for use with little compute - so can be applied on phones
# 

## Process
# 1) for each channel of input image, apply a separate conv matrix (each channel is treated separately
# with its own conv matrix). Returns a slightly smaller image (if no padding) with same channel count
# 2) apply 'pointwise' conv (aka 'projection'), involving 1x1xchannel_count multiplied by each square in the input
# tensor (so each 'front square' gets 3 multiplications which are summed, if there are 3 channels)
# This returns a 2d image of same dims as input at start of (2). Can do this step multiple times 
# with different 1x1xchannel_count inputs, each returning a new output channel (making the output
# to this step 3d)
# 
# these 2 processes are called Depthwise and Pointwise. Combined they are DepthWise Separable Convolution
# and doing these can save a lot of compute vs doing a usual convolution
# 

# costs of doing this vs standard conv can be estimated with a formula:
def mobilenet_cost_vs_conv(channel_count, conv_matrix_size):
    """Estimate the compute cost of a depthwise separable conv relative to a standard conv.

    The ratio is ``1 / channel_count + 1 / conv_matrix_size**2``. It comes from
    dividing the multiplication count of (depthwise + pointwise) by that of a
    standard convolution producing the same output; the input channel count and
    the output height/width cancel out.

    Args:
        channel_count: number of OUTPUT channels (i.e. number of filters) of the
            convolution. Must be > 0.
        conv_matrix_size: width of the square conv matrix in pixels (3 for a 3x3
            filter). Must be > 0.

    Returns:
        The cost ratio as a float. Values below 1 mean the depthwise separable
        convolution is cheaper than the standard one.

    Raises:
        ValueError: if either argument is zero or negative, since the ratio is
            undefined or meaningless for such sizes.
    """
    # Reject sizes that would divide by zero or yield a nonsensical ratio.
    if channel_count <= 0 or conv_matrix_size <= 0:
        raise ValueError(
            "channel_count and conv_matrix_size must both be positive, got "
            f"channel_count={channel_count!r}, conv_matrix_size={conv_matrix_size!r}"
        )
    return (1 / channel_count) + (1 / conv_matrix_size**2)

mobilenet_cost_vs_conv(5, 3) # the coursera example: channel_count=5, conv_matrix_size=3 -> 1/5 + 1/3**2 ~= 0.31
# The course says that IRL this number will be smaller - maybe around 0.1
# (as channel_count grows the 1/channel_count term shrinks, leaving roughly 1/3**2 ~= 0.11 for a 3x3 filter)



# mobilenet explanation:
# https://www.coursera.org/learn/convolutional-neural-networks/lecture/B1kPZ/mobilenet


0.3111111111111111

In [None]:
## MobileNet architecture:
# 13 repeated layers of Depthwise Separable Convolution, followed by pooling, FC and output softmax


# MobileNetv2 architecture:
# adds 
# 1) "residual connection": takes the input to a block and adds it element-wise to that block's output
# (a skip connection, same idea as in resNet - the input is added, not stacked channel-wise)
# 2) adds 'expansion layer' prior to the depthwise operation: a 1x1 conv that increases the channel count by
# a factor of N (using learned filters, not copies of the input). eg if N=6, 4x4x3 input becomes 4x4x18

# expansion layer + depthwise layer + pointwise (aka projection) layer = 'bottleneck block' (when all 3 combined)

# v2 has 17 bottleneck blocks, then pooling and output softmax



In [None]:
## things you can do to change where you sit on compute/performance scale:
# change image resolution
# make network deeper
# increase 'width' of network (number of operations happening on each layer: so more nodes in FC layers
# and bigger outputs from conv layers)

# EfficientNet provides process ('compound scaling') for finding best tradeoff and combination of these for your
# implementation inc your hardware limitations. The same name is also used for the family of pre-trained
# networks (B0-B7) that were produced by applying that process:
# https://github.com/lukemelas/EfficientNet-PyTorch

# 


In [None]:
# can be hard to replicate exact model architecture based on papers: if they have github code
# good to start with that, rather than from scratch


# if you're trying something new, look for existing work on similar tasks, and borrow code that creates a
# network which is known to work for that task





In [None]:
### transfer learning:
# get rid of existing output layer (which will often be softmax), and make your own
# with the classifications you want (eg: photo of anya, dad, or someone else)
# 
# think of weights for all layers apart from output layer as frozen (in pytorch, freeze them by setting
# requires_grad = False on those layers' parameters)
#
# put all inputs in your training data through frozen layers, saving output of final frozen layer
# then train a shallow softmax model to predict result, based on input from this final frozen layer, 
# 
# if you have a larger training set, you could unfreeze some of the later layers (if do this have
# option of using existing weights as initial weights, or can start with random weights)

# rule of thumb: the more data you have the more layers you might unfreeze

# says you should almost always do transfer learning if doing computer vision, unless you have tonnes
# of resources




In [None]:
## data augmentation methods
# mirroring
# random cropping
# rotation
# shearing
# local warping
# color shifting (adding or subtracting randomly to RGB channels, often by small amounts drawn from distributions)
#   'PCA colour augmentation' is a method using PCA to scale RGB in a way that keeps tints representative
#   of the real world
#     



## where training set doesn't fit in memory, might have a streaming process that loads one mini-batch's worth
# of data from HDD, applies distortions if needed, then feeds it through a training iteration of gradient descent
# and so on




In [None]:
## tips for doing well on benchmarks:
# ensembling (tends not to be used in production as it might not be worth the extra compute)
# Multi-crop (at test time): run classifier on multiple versions of test images and average the results, eg:
    # images might be mirrored and cropped. '10 crop' is one popular one (centre crop + 4 corner crops,
    # each taken from both the image and its mirror image = 10 versions)
    # Also tends not to be used in production, though far less expensive than ensembling
# 

