In [None]:
import tensorflow_datasets as tfds
import tensorflow as     tf
import math
import numpy             as np
import matplotlib.pyplot as plt
from tensorflow.keras.layers import *
print(tf.__version__)

In [None]:
from utils.cifar_functions import *

# ---- CIFAR-10 input pipelines ----
# Load both splits through TFDS. `info` is rebound by the second call, so it
# ends up holding the metadata object returned with the test split.
dataset_train, info = tfds.load("cifar10", split=tfds.Split.TRAIN, with_info=True)
dataset_test, info = tfds.load("cifar10", split=tfds.Split.TEST, with_info=True)

# Training pipeline: per-example preprocessing -> shuffle -> batch -> prefetch.
dataset_train = (
    dataset_train
    .map(pre_processing_train, num_parallel_calls=4)
    .shuffle(buffer_size=TRAINING_SHUFFLE_BUFFER)
    .batch(TRAINING_BATCH_SIZE)
    .prefetch(buffer_size=3)
)

# Testing pipeline: same stages minus the shuffle; reuses the training batch size.
dataset_test = (
    dataset_test
    .map(pre_processing_test, num_parallel_calls=4)
    .batch(TRAINING_BATCH_SIZE)
    .prefetch(buffer_size=3)
)

# Inception V1 ([paper](https://arxiv.org/pdf/1409.4842v1.pdf))

<img src="imgs/inceptionv1_block.png">

## Notes
*The main idea of the Inception architecture is based on finding out how an optimal local sparse
structure in a convolutional vision network can be approximated and covered by readily available
dense components*


*This leads to the second idea of the proposed architecture: __judiciously applying dimension reductions and projections wherever the computational requirements would increase too much otherwise.__
This is based on the success of embeddings: even low dimensional embeddings might contain a lot
of information about a relatively large image patch. However, embeddings represent information in
a dense, compressed form and compressed information is harder to model. We would like to keep
our representation sparse at most places (as required by the conditions of [2]) and compress the
signals only whenever they have to be aggregated en masse. That is, __1×1 convolutions are used to
compute reductions before the expensive 3×3 and 5×5 convolutions.__*

*One of the main beneficial aspects of this architecture is that it allows for increasing the number of
units at each stage significantly without an uncontrolled blow-up in computational complexity. The
ubiquitous use of dimension reduction allows for shielding the large number of input filters of the
last stage to the next layer, first reducing their dimension before convolving over them with a large
patch size. Another practically useful aspect of this design is that it aligns with the intuition that
__visual information should be processed at various scales and then aggregated so that the next stage
can abstract features from different scales simultaneously.__*

*__By adding auxiliary classifiers connected to
these intermediate layers, we would expect to encourage discrimination in the lower stages in the
classifier, increase the gradient signal that gets propagated back, and provide additional regularization__*


*Our training used asynchronous stochastic gradient descent with 0.9 momentum [17], fixed learning rate schedule (decreasing the learning rate by 4% every 8 epochs). Polyak averaging [13] was used to create the final model used at inference time.*


<img src="imgs/lenet.PNG">

In [None]:
def generic_tail(inputs, dims):
    """
    Network entry block: one 3x3 convolution over `inputs`.

    The convolution has `dims` filters, ReLU activation and 'same' padding.
    Returns the output of that convolution.
    """
    entry_conv = Conv2D(filters=dims, kernel_size=(3, 3), padding='same', activation='relu')
    return entry_conv(inputs)

def InceptionV1(inputs, squeeze_dims, out_dims, strides=(1,1)):
    """
    Inception V1 Conv Block (GoogLeNet).

    Four parallel branches are applied to `inputs` and their outputs are
    concatenated along axis 3:
      1. 1x1 conv_block (out_dims)
      2. 1x1 conv_block (squeeze_dims) -> 3x3 conv_block (out_dims)
      3. 1x1 conv_block (squeeze_dims) -> 5x5 conv_block (out_dims)
      4. 3x3 max pool, stride 1        -> 1x1 conv_block (out_dims)

    Args:
        inputs: tensor fed to every branch. The concat on axis 3 implies a
            4-D, channels-last feature map -- TODO confirm against caller.
        squeeze_dims: `dims` passed to the 1x1 reduction conv_blocks that
            precede the 3x3 and 5x5 convolutions.
        out_dims: `dims` passed to the last conv_block of every branch.
        strides: currently unused by this block.

    Returns:
        The four branch outputs concatenated along axis 3.
    """
    # Branch 1: plain 1x1 convolution.
    x1 = conv_block(inputs, out_dims, kernel_size=(1,1))

    # Branch 2: 1x1 reduction before the more expensive 3x3 convolution.
    x2 = conv_block(inputs, squeeze_dims, kernel_size=(1,1))
    x2 = conv_block(x2, out_dims, kernel_size=(3,3))

    # Branch 3: 1x1 reduction before the more expensive 5x5 convolution.
    x3 = conv_block(inputs, squeeze_dims, kernel_size=(1,1))
    x3 = conv_block(x3, out_dims, kernel_size=(5,5))

    # Branch 4: max pool followed by a 1x1 projection of the *pooled* tensor.
    # padding='same' keeps the spatial size equal to the other branches so the
    # concat below is valid (conv_block presumably pads 'same' as well, since
    # the 1x1/3x3/5x5 branches are concatenated -- verify in utils).
    x4 = MaxPool2D(pool_size=(3,3), strides=(1,1), padding='same')(inputs)
    x4 = conv_block(x4, out_dims, kernel_size=(1,1))

    concat = tf.concat([x1,x2,x3,x4], axis=3)
    return concat

def generic_head(inputs, dims=None):
    """
    Classification head: global average pooling, then a softmax Dense layer.

    The Dense layer has DATA_NUM_CLASSES units. `dims` is accepted but not
    used by this head.
    """
    pooled = GlobalAveragePooling2D()(inputs)
    classifier = Dense(units=DATA_NUM_CLASSES, activation='softmax')
    return classifier(pooled)

In [None]:
# Assemble the Inception V1 network from the tail / block / head builders
# defined above.
inceptionv1 = VGG_Like_CNN(generic_tail,
            InceptionV1,
            generic_head,
            input_shape = (DATA_CROP_ROWS, DATA_CROP_COLS, DATA_CHANNELS),
            num_levels= 3,
            num_downsamples=2,
            block_repeats=BLOCK_REPEATS)

# Compile the model that was just built. Compiling any other name here would
# either fail on a fresh kernel or silently leave `inceptionv1` uncompiled.
# 'sparse_categorical_crossentropy' expects integer class labels (not one-hot).
inceptionv1.compile(optimizer = tf.keras.optimizers.Adam(TRAINING_LR_MAX),
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])

In [None]:
# Report the size of the Inception V1 model as computed by get_num_params.
num_trainable_params = get_num_params(inceptionv1)
print("Number of trainable parameters: ", num_trainable_params)

In [None]:
# Run training on the CIFAR pipelines under the run name 'inceptionv1';
# whatever train() returns is kept in `hist`.
hist = train(
    inceptionv1,
    dataset_train,
    dataset_test,
    'inceptionv1',
    logs=True,
)

# Inception V2 ([paper](https://arxiv.org/pdf/1512.00567v3.pdf))

*Avoid representational bottlenecks, especially early in
the network. One should avoid
bottlenecks with extreme compression. In general the
representation size should gently decrease from the inputs to the outputs before reaching the final representation used for the task at hand.*

*Spatial aggregation can be done over lower dimensional embeddings without much or any loss in representational power. For example, before performing a
more spread out (e.g. 3 × 3) convolution, one can reduce the dimension of the input representation before
the spatial aggregation without expecting serious adverse effects.*

__Inception V2__ focused on: 
- avoiding representational bottlenecks with extreme compression. 
- converting the 5x5 convolution to two stacked 3x3 convolutions. 
- reducing compute by factorizing an nxn convolution into stacked nx1 and 1xn convolutions.

In [None]:
def InceptionV2_fig5(inputs, squeeze_dims, out_dims, strides=(1,1)):
    """
    Inception V2 Figure 5

    Four parallel branches are applied to `inputs` and their outputs are
    concatenated along axis 3:
      1. 1x1 conv_block (squeeze_dims)
      2. 1x1 conv_block (squeeze_dims) -> 3x3 conv_block (out_dims)
      3. 1x1 conv_block (squeeze_dims) -> 3x3 -> 3x3 conv_blocks (out_dims);
         the two stacked 3x3 convolutions stand in for a 5x5 convolution
      4. 3x3 max pool, stride 1        -> 1x1 conv_block (squeeze_dims)

    Args:
        inputs: tensor fed to every branch. The concat on axis 3 implies a
            4-D, channels-last feature map -- TODO confirm against caller.
        squeeze_dims: `dims` for the 1x1 conv_blocks (reductions, branch 1 and
            the pool projection).
        out_dims: `dims` for the 3x3 conv_blocks.
        strides: currently unused by this block.

    Returns:
        The four branch outputs concatenated along axis 3.
    """
    # Branch 1: plain 1x1 convolution.
    x1 = conv_block(inputs, squeeze_dims, kernel_size=(1,1))

    # Branch 2: 1x1 reduction followed by a single 3x3 convolution.
    x2 = conv_block(inputs, squeeze_dims, kernel_size=(1,1))
    x2 = conv_block(x2, out_dims, kernel_size=(3,3))

    # Branch 3: 1x1 reduction followed by two stacked 3x3 convolutions.
    x3 = conv_block(inputs, squeeze_dims, kernel_size=(1,1))
    x3 = conv_block(x3, out_dims, kernel_size=(3,3))
    x3 = conv_block(x3, out_dims, kernel_size=(3,3))

    # Branch 4: max pool followed by a 1x1 projection of the *pooled* tensor.
    # padding='same' keeps the spatial size equal to the other branches so the
    # concat below is valid (conv_block presumably pads 'same' as well --
    # verify in utils).
    x4 = MaxPool2D(pool_size=(3,3), strides=(1,1), padding='same')(inputs)
    x4 = conv_block(x4, squeeze_dims, kernel_size=(1,1))

    concat = tf.concat([x1,x2,x3,x4], axis=3)
    return concat

def InceptionV2_fig6(inputs, squeeze_dims, out_dims, strides=(1,1)):
    """
    Inception V2 figure 6

    Same layout as the figure-5 block, but every 3x3 convolution is factorized
    into a 3x1 conv_block followed by a 1x3 conv_block. Four parallel branches
    are applied to `inputs` and concatenated along axis 3:
      1. 1x1 conv_block (squeeze_dims)
      2. 1x1 conv_block (squeeze_dims) -> 3x1 -> 1x3 (out_dims)
      3. 1x1 conv_block (squeeze_dims) -> 3x1 -> 1x3 -> 3x1 -> 1x3 (out_dims)
      4. 3x3 max pool, stride 1        -> 1x1 conv_block (squeeze_dims)

    Args:
        inputs: tensor fed to every branch. The concat on axis 3 implies a
            4-D, channels-last feature map -- TODO confirm against caller.
        squeeze_dims: `dims` for the 1x1 conv_blocks (reductions, branch 1 and
            the pool projection).
        out_dims: `dims` for the factorized 3x1 / 1x3 conv_blocks.
        strides: currently unused by this block.

    Returns:
        The four branch outputs concatenated along axis 3.
    """
    # Branch 1: plain 1x1 convolution.
    x1 = conv_block(inputs, squeeze_dims, kernel_size=(1,1))

    # Branch 2: 1x1 reduction, then one factorized 3x3 (3x1 followed by 1x3).
    x2 = conv_block(inputs, squeeze_dims, kernel_size=(1,1))
    x2 = conv_block(x2, out_dims, kernel_size=(3,1))
    x2 = conv_block(x2, out_dims, kernel_size=(1,3))

    # Branch 3: 1x1 reduction, then two factorized 3x3 convolutions.
    x3 = conv_block(inputs, squeeze_dims, kernel_size=(1,1))
    x3 = conv_block(x3, out_dims, kernel_size=(3,1))
    x3 = conv_block(x3, out_dims, kernel_size=(1,3))
    x3 = conv_block(x3, out_dims, kernel_size=(3,1))
    x3 = conv_block(x3, out_dims, kernel_size=(1,3))

    # Branch 4: max pool followed by a 1x1 projection of the *pooled* tensor.
    # padding='same' keeps the spatial size equal to the other branches so the
    # concat below is valid (conv_block presumably pads 'same' as well --
    # verify in utils).
    x4 = MaxPool2D(pool_size=(3,3), strides=(1,1), padding='same')(inputs)
    x4 = conv_block(x4, squeeze_dims, kernel_size=(1,1))

    concat = tf.concat([x1,x2,x3,x4], axis=3)
    return concat

# Inception V4 ([paper](https://arxiv.org/pdf/1602.07261.pdf))

## Notes

Inception V4 introduces a new tail design that features:
- parallel stride 2 3x3 conv and 3x3 max pool (concatenated)
- parallel 3x3 conv and factorized 7x1 and 1x7 conv

<img src="imgs/inceptionv4_tail.png">

There a