In [1]:


# Notes from weeks 3 and 4 of https://www.coursera.org/learn/convolutional-neural-networks/lecture/nEeJM/object-localization

# to make: 
# apply pre-trained YOLO to a few family Australia photos
# apply pre-trained U-Net to Australia photos
# YOLO with transfer learning applied to some data (could it learn to identify faces? - it's only one class)
# get pre-trained siamese for facial verification, and transfer with new photos of fam



### to identify photos of the same thing from a large pool of photos:
# 1) (maybe - this might eliminate useful information) use YOLO or similar to do object detection and crop image to
# only have object of interest
# (If don't crop image might at least want to make it so the object of interest is in the centre of the image
# so that's consistent across all images: could use YOLO's output for that too)
# 2) train siamese network to recognise when two images are of the same thing, and when they aren't
# Usual things apply: training data of same distribution and balance as real-world data, etc
# Ideally would have multiple images of each item from different angles
# (could then make my own augmentations from there, eg: changing light/tone/rotation/mirror/etc of image)
# If can't identify similar images with high confidence, could at least assign a probability
# which could be combined with other information in Bayes Net to give a final score




# some views on training sizes needed per class:
# https://stackoverflow.com/questions/55356982/how-many-imagesminimum-should-be-there-in-each-classes-for-training-yolo




import numpy as np




In [3]:
# Cell 1
# detection problem: where you have multiple objects in image and want to localise (get bounding box) for all of them


# to get bounding box:
# penultimate layer feeds into classification softmax layer, and also into a layer which 
# predicts a bounding box (or maybe combines them both, as per the vectors feeding into the loss func
# defined below)

# (0,0) is top left; (1,1) is bottom right
# Bounding box prediction takes the form of 4 values:
# bx, by (mid of rect coords); bw (width of rect); bh (height of rect)

# Y (vector to predict) for single value with 4 possible classification values 
# (of which 1 is 'nothing there') has 8 values:
# Pc (prob anything other than 'nothing' is there)
# bx by bw bh
# c1, c2, c3 (probs assigned to each of the not-nothing classifications)

# if Pc = 0 ('nothing there'), then we don't care about any of the other values, so they are
# not evaluated in the loss function in this case

def mse_loss_localisation_single_object(y_hat, y):
    """Squared-error loss for a single-object localisation prediction.

    Both vectors follow the layout described in the notes above:
    [Pc, bx, by, bw, bh, c1, ..., cn], where Pc flags whether any object
    is present.

    Parameters
    ----------
    y_hat : array-like, 1d
        Predicted vector.
    y : array-like, 1d
        Ground-truth vector; y[0] must be exactly 0 or 1.

    Returns
    -------
    numpy scalar
        If y[0] == 1: the sum of squared errors over every component
        (classification and bounding box together).
        If y[0] == 0: the squared error of the Pc component only, since the
        remaining values are "don't care" when nothing is in the image.

    Raises
    ------
    ValueError
        If y[0] is neither 0 nor 1 (previously this case silently
        returned None).
    """
    # Accept plain lists/tuples as well as numpy arrays.
    y_hat = np.asarray(y_hat)
    y = np.asarray(y)

    object_present = y[0]
    if object_present == 1:
        # Something is in the image: score the class probabilities and the box.
        return np.sum(np.power(y_hat - y, 2))
    if object_present == 0:
        # Nothing in the image: only the presence probability matters.
        return np.power(y_hat[0] - y[0], 2)
    raise ValueError(
        "y[0] (Pc) must be 0 or 1 in the ground-truth vector, got {!r}".format(object_present)
    )
    




In [None]:
# landmark detection: return x and y coords of something interesting in an image
# can modify NN to do that by adding two more values to output vector, presumably
# with an extra value to tell you if the object of interest is present at all (eg: something
# to tell you where nose and eyes are on pictures of people, and 1+ images have no faces in them
# so add term for whether face is present or not)

# landmarks can be defined on the face, say 64 key landmarks, which effectively become
# features extracted from pictures of people's faces 

def landmark_feature_counter(landmarks_count):
    """Return the output-vector length needed to predict `landmarks_count` landmarks.

    Each landmark contributes two values (its x and y coordinates), and one
    extra value indicates whether the object of interest (eg: a face) is
    present in the image at all.
    """
    coordinate_values = landmarks_count * 2  # an (x, y) pair per landmark
    presence_indicator = 1                   # is the object of interest there at all?
    return coordinate_values + presence_indicator

# AR uses landmarks to work out how best to append objects to pictures of faces

# landmarks can also tell you people's actions or poses, by landmarking different parts of the body



In [None]:
# initial training might have heavily cropped images

# sliding windows: scanning across image several times over with gradually increasing window sizes, putting
# each window image through a CNN which has been trained on heavily cropped images of the thing you're
# interested in (eg: cars, where the car takes up almost the whole image)


# Sliding windows can be sped up by implementing it "convolutionally". This means:
# Converting all FC layers to 1x1 conv layers. This can be done without changing the operations done, only
# the dimensions of tensors passed around
# However this would still mean passing the same number of bounding boxes through the network
# So conv the whole image (or a wider part of it) and put it through the process, giving you
# bigger layers for the conv layers which replace the FC ones , and each square in the tensor
# gives you the info you'd have from a whole freeze-frame from a sliding window
# eg if you were to slide in 2x2 windows, you'd end up with 2x2xC data
# This saves some compute
# It can be done in a bigger scale, eg: replacing 8x8 sliding window with one conv layer


# see the 10:00 mark for diagram on how conv on a single freezeframe of a window can do the same thing
# as lots of frozen windows using FC 
# https://www.coursera.org/learn/convolutional-neural-networks/lecture/6UnU4/convolutional-implementation-of-sliding-windows



# 1x1 conv layers can effectively do the same operations as FC layers, see here:
# https://www.coursera.org/learn/convolutional-neural-networks/lecture/6UnU4/convolutional-implementation-of-sliding-windows


In [None]:
# With the above, the bounding boxes won't be too accurate (output is just whether something is present in a given
# sliding window). To improve this use Yolo:

# divide image into grid (19x19 is common)
# put each individual image through algo set out in Cell 1 above to detect class and bounding box
# objects are assigned to the grid cell which contains their midpoint
# Works fine so long as no more than 1 object in each cell (or empty)

# because it uses convolution to save iteratively moving over sliding windows it can be used for realtime
# object detection

# for bounding box: within each window, the top left is 0,0 the bottom right is 1,1
# bw or bh could be greater than 1 if the bounding box is bigger than the individual window (this happens)




In [None]:
def intersection_over_union(area_intersection, area_union):
    """Compute intersection over union (IoU) for two bounding boxes.

    Takes the area shared by both boxes and the total area covered by either
    box, and returns their ratio.

    A predicted bounding box is conventionally judged "correct" when its IoU
    with the real bounding box exceeds 0.5 (a stricter threshold can be used
    if preferred). IoU also serves as a general measure of how much two boxes
    overlap.
    """
    overlap_ratio = area_intersection / area_union
    return overlap_ratio





In [None]:
## non-max suppression: prevents you detecting the same object more than once

# each window will give a probability of detecting an object, so where there's overlap (using IoU)
# set probabilities to zero for all windows apart from the one with the highest probability

### full algo:
# 1) discard all boxes with probability <= 0.6
# 2) pick box with highest probability for that class
# 3) discard any remaining boxes with IoU > 0.5 of this chosen box
# So if IoU < 0.5, it will be detected as a separate instance of that object


# non-max suppression:
# https://www.coursera.org/learn/convolutional-neural-networks/lecture/dvrjH/non-max-suppression

In [None]:
## Anchor boxes:

# define rough shape you expect for a given class (eg tall rect for person, fat rect for car),
# then output vector has usual_len (eg: 8 values if using example in Cell 1) * total_anchor_boxes (eg: 3 if 
# looking for 3 classes of object)
# objects are assigned an anchor box by whichever anchor box best fits it (highest IoU), and
# non-max suppression is only done for objects in the same assigned anchor box
# this allows for overlapping objects of different types and shapes, such as a person standing
# in front of a car


# cant handle more objects in one grid cell than there are anchor boxes (doesnt happen much if you use 19x19 grid)


# ways of choosing anchor boxes:
# by judgement
# k-means clustering of bounding boxes in real data to find archetypical anchor box shapes which 
# represent one or more classes


# more on anchor boxes:
# https://www.coursera.org/learn/convolutional-neural-networks/lecture/yNwO0/anchor-boxes



# yolo is a FCN = fully convolutional network

In [None]:
def len_of_output_vec(no_of_classes, no_of_anchor_boxes):
    """Return the length of the per-grid-cell output vector.

    Every anchor box carries 5 fixed values (Pc plus bx, by, bw, bh) and one
    probability per class. Guessing you'd often have the same number of
    anchor boxes as classes, but not always.
    """
    values_per_anchor_box = no_of_classes + 5
    return no_of_anchor_boxes * values_per_anchor_box


# if you break grid into 19*19, and len_of_output_vec = 24, then output from conv net for one image would
# be 19*19*24
# or maybe 19*19*3*8 (if each of the 3 anchor boxes has its own array of len 8)





# summary of YOLO algo process:
# https://www.coursera.org/learn/convolutional-neural-networks/lecture/fF3O0/yolo-algorithm


In [None]:
# R-CNN: Regional CNN
# uses 'segmentation algorithm' to decide which regions are worth scanning, so not all image is processed
# by CNN, but only those which seem interesting (saving compute on trying to classify bits of sky, for instance)

# the original R-CNN was slow; Fast R-CNN is faster
# Faster R-CNN uses conv. net to propose regions, making it faster, but YOLO is still quite a bit faster

# Andrew Ng doesnt use R-CNN



In [None]:
# semantic segmentation: aims to put exact outline around object to the exact pixel

# U-net is an algo that labels every single pixel as either 'nothing' or one of the classes proposed

# U-net architecture looks like CNN as dims reduce and channels increase, however instead of going through
# FC midway, it reverses dimensional direction and starts reducing channels and increasing dimensions in 
# each convolution until it outputs a tensor of the same dimensions as the input image (which is necessary
# as its putting a classification value to each pixel)

# uses transpose convolutions to get from smaller matrix to larger one
# transpose convolution uses a conv matrix larger than the input matrix, where each
# value in the input matrix is, one at a time, multiplied by all values in 
# the convo matrix, the values from which then go to the output matrix, with the
# transpose-conv output matrices from each individual input projected onto the 
# main output matrix in much the same way as a standard conv process scans: moving
# along with a certain stride size. Where individual transposed conv. matrices overlap
# the results are summed

# summary of transpose conv process:
# https://www.coursera.org/learn/convolutional-neural-networks/lecture/kyoqR/transpose-convolutions


# U-net uses skip-connections: passing outputs from one of the first conv layers straight to its mirror 
# layer (in the sense it's the same dimensions) nearer the end of the network. This gives this late-layer
# more high-res pixel-level info on where things are in the image which the data which has been through the
# many convolutions won't have (as it's lost a certain amount of resolution in doing this)


# there are multiple skip-connection links between layers in U-net

# output for single image = original_image_width * original_image_height * number_of_classes



# full u-net architecture overview:
# https://www.coursera.org/learn/convolutional-neural-networks/lecture/GIIWY/u-net-architecture


In [None]:
### Week 4!


In [None]:
# facial verification: have one image of face and have to determine if that is the face it claims to be
            # (a 1:1 test)
    
# facial recognition: see if image of a face is anyone in a list of people (harder as much more room to make mistake)
        # (a 1:k test)


    


In [None]:
# facial verification is a one-shot problem: learning to recognise the person from a single training image
# 


# use a similarity function: calculates degree of difference between images im1 and im2
# if the degree of difference is below a certain threshold you say the person is who they claim to be


# Siamese network: only one network really, which involves conv. then FC layers, ending in a 
# vector (length 128 in the video) from the input of a face. Then compute the similarity
# of the two 128 vectors which represent two faces


# When training siamese network: want to ensure that if two pictures are of the same person
# the difference between the output vectors is small, and big if the people are different
# This is done in the network "DeepFace"


# 




In [None]:
# Triplet loss: taking reference image, then getting similarity score between it and:
    # 1) another image of that person
    # 2) image of someone else
# want similarity to be higher for image of same person by a 'margin' parameter which we set
    
def triplet_loss_single_trio_of_encoded_images(reference, same, different, margin):
    """Triplet loss for one (reference, same, different) trio of encoded images.

    Computes max(||reference - same||^2 - ||reference - different||^2 + margin, 0),
    where ||.||^2 is the squared Euclidean distance between two encodings.

    The loss is 0 if the distance between reference and 'same' is at least
    margin's value lower than the distance between reference and 'different';
    otherwise it's positive.

    Parameters
    ----------
    reference : array-like, 1d
        Encoding of the reference image.
    same : array-like, 1d
        Encoding of another image of the same person.
    different : array-like, 1d
        Encoding of an image of someone else.
    margin : float
        Required gap between the two squared distances.

    Returns
    -------
    numpy scalar
        The non-negative triplet loss for this trio.
    """
    reference = np.asarray(reference)
    same = np.asarray(same)
    different = np.asarray(different)

    # The sum over the encoding's components sits inside each squared distance:
    # square element-wise first, then sum (squared L2 norm, not the square of
    # the L1 norm).
    distance_to_same = np.sum((reference - same) ** 2)
    distance_to_different = np.sum((reference - different) ** 2)

    # np.maximum is the element-wise max of two values; np.max(x, 0) would
    # instead treat 0 as an axis argument.
    return np.maximum(distance_to_same - distance_to_different + margin, 0)


# though the videos talk about one-shot learning, to train you need several pictures of people
# so you can train the model

# when making training set, pick images which are similar to the reference for the non-match person
# so the model learns better distinctions between individual people
# If you choose the non-match photo randomly, it will be too easy for the model, and it won't 
# learn much while getting very little loss on training


    
# more on triplet loss about halfway thru this vid:
# https://www.coursera.org/learn/convolutional-neural-networks/lecture/HuUtN/triplet-loss



# could put the element-wise difference between two encoded faces into a logistic regression to 
# predict if same face or not (binary prediction)
# a couple of other tweaks one could make:
# https://www.coursera.org/learn/convolutional-neural-networks/lecture/xTihv/face-verification-and-binary-classification



    
    


    
    