In [31]:
import tensorflow as tf
import tensorflow.keras.datasets as tfds
from utils.layer_units import *
from utils.ResAttNet import ResAttNet
import pydotplus
import numpy as np


# Plot configurations
%matplotlib inline

# Notebook auto reloads code. (Ref: http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython)
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Trainning CIFAR 100 data with ResAttNet
In this file we first load in the training and testing data from CIFAR 10 dataset. We chose the last 1000 out as validation set. We did a preprocess where we pixelwise subtracted the RGB mean across data input data. We didn't scale by 255 because we found it makes the validation accuracy unstable and take longer to train with less accurate result. We build model with ResAttNet class and use experimenting with the various parmeters. We used default Adam optimizer. We do experiment with SGD optimizer as described by in the paper. The accuracy didn't improve one percent above chance for several epochs so we decided to using Adam instead. We also added ImageDataGenerator to augment our data as we learned in early days of exploring. We chose some common image augmentation parameter, including shear, zoom, horizontal flip, slight rotation, horizontal and vertical shift. When we used the augmentation, it raised our accuracy from around 60% to 75%. After the run we saved good model and saved the training history as a csv for plotting. 
Detailed analysis on each parameter:
pre_conv = True, pre_pooling = False, 
attention_num = 3,network_param = [1,1,1],
resid_params = [[64,3,1,0],[64,3,1,0],[64,1,1,0]],
skip_param = [True, 1]
We find adding one convolution and one pooling before the first attention module very helpful, especially the first pooling. It drastically reduced the total parameters needed to be trained with minimal sacrifice in validation accuracy. We also experimented with 2 and 3 residual units. 3 units are slightly better compared to 2 attention units but took twice as long to train. For network_parameter we found the last 1 which indicate how many dense layer to use before the final softmax dense layer. The answer seemed to be 1, we found 0 layer and 2 layer both offered less accurate and stable result. The key tuning involved resid_params. We tried many combination of filter numbers, total number of convolutions and kernel sizes. We believe 64 filters worked best as it has comparable result as 128 filters and took less memory and time to train and it had better result than 32 filters. Also there should be at least two 3*3 kernel in the residual units. We had tried with all 1*1 kernels the accuracy is very low. When we started using 3*3 kernels in two layers. The result drastically improved. However, not much difference observed when usig all 5*5 kernels aside from memory outage. We also experimented with whether to have skip_param in the hourglass module inside the model. It seemed using 1 skip layer is optimal because using 2 sometimes give out of memory message and using 0 leads to worse results. 

In [32]:
(X_train,y_train),(X_test,y_test) = tfds.cifar100.load_data()

In [33]:
X_valid = X_train[49000:]
y_valid = y_train[49000:]
X_train = X_train[:49000]
y_train = y_train[:49000]

In [34]:
# center but not scale data
# we found scale not very stable
def preprocess(X):
    # pixel wise center
    ret = X.astype(np.float32) - np.mean(X,axis=0)
    return ret

In [35]:
X_train = preprocess(X_train)
X_valid = preprocess(X_valid)
X_test = preprocess(X_test)

In [36]:
my_RANet = ResAttNet(100,256)

In [37]:
ipt = tf.keras.Input(shape=(32,32,3))
model_out = my_RANet.build(ipt, pre_conv = True, pre_pooling = False, 
             attention_num = 3,network_param = [1,1,1],
             resid_params = [[64,3,1,0],[64,3,1,0],[64,1,1,0]],
             skip_param = [True, 1])
model = tf.keras.Model(ipt,model_out)
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 32, 32, 3)]  0                                            
__________________________________________________________________________________________________
conv2d_351 (Conv2D)             (None, 32, 32, 64)   1792        input_6[0][0]                    
__________________________________________________________________________________________________
batch_normalization_249 (BatchN (None, 32, 32, 64)   256         conv2d_351[0][0]                 
__________________________________________________________________________________________________
activation_257 (Activation)     (None, 32, 32, 64)   0           batch_normalization_249[0][0]    
____________________________________________________________________________________________

In [38]:
opt = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9,nesterov=True)

model.compile(optimizer = 'Adam',
    loss = 'sparse_categorical_crossentropy',
    metrics = ['sparse_categorical_accuracy'],
)

In [39]:
# ImageDataGenerator code referred from hw2 Task4
# and referred from https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator
datagen = tf.keras.preprocessing.image.ImageDataGenerator(shear_range=0.1,zoom_range=0.2,horizontal_flip=True,rotation_range=30,width_shift_range=0.1,
                             height_shift_range=0.1)
datagen.fit(X_train)

In [40]:
#history = model.fit(X_train,y_train,epochs=10,batch_size=128,validation_split=0.2,callbacks=[tensorboard_callback])
bs = 128
history = model.fit(datagen.flow(X_train,y_train,batch_size=bs),epochs=30,steps_per_epoch=len(X_train)//bs,
                    validation_data = (X_valid,y_valid))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [41]:
model.save('cifar_100_three_attention_64_3_64_1_resid.h5')

In [42]:
model.evaluate(x=X_test,y=y_test)



[1.8403987884521484, 0.5126000046730042]

In [1]:
import pandas as pd
hist_normalized = pd.DataFrame(history.history)

NameError: name 'history' is not defined

In [12]:
hist_normalized.to_csv('hist_two_attention_triple_64_3_3_one_dense_no_prepooling.csv',sep=',')