# Phase 3 Data Science Training Notebook

In [None]:
import pandas as pd
import numpy as np

## Exploratory Data Analysis

In [None]:
# Assume tar.gz file to be extracted at project directory.
def unpickle(file):
    """Load a pickled batch file and return the object stored in it.

    Parameters
    ----------
    file : str or path-like
        Path of the pickle file to read.

    Returns
    -------
    object
        Whatever was pickled in the file. Because ``encoding='bytes'`` is
        used, strings pickled by Python 2 come back as ``bytes`` (which is
        why the notebook indexes the result with keys such as ``b'labels'``).

    Notes
    -----
    SECURITY: ``pickle.load`` can execute arbitrary code while loading.
    Only call this on files obtained from a trusted source.
    """
    import pickle
    with open(file, 'rb') as fo:
        # Named `batch` rather than `dict` so the builtin type is not shadowed.
        batch = pickle.load(fo, encoding='bytes')
    return batch

In [None]:
# Load the first training batch and list the keys it contains.
data_batch_1 = unpickle("data_batch_1")
data_batch_1.keys()

In [None]:
# Load the metadata file and list the keys it contains.
labelNames = unpickle("batches.meta")
labelNames.keys()

In [None]:
# Inspect the metadata: cases per batch, the class names, then every stored value.
print(labelNames[b'num_cases_per_batch'])
print(labelNames[b'label_names'])
labelNames.values()

In [None]:
# Class names from the metadata; getFiveImages looks them up by numeric label.
labelList = labelNames[b'label_names']

In [None]:
# Show the descriptive label stored with the first batch.
data_batch_1[b'batch_label']

In [None]:
# Keep the first batch's labels at hand, then report how many there are
# and the largest / smallest label value.
labels = data_batch_1[b'labels']
print(len(labels))
print(max(labels))
print(min(labels))

In [None]:
# For every class 0-9, collect the indices of its first five examples in `labels`.
indexForEach = []
for i in range(10):
    temp = []
    for idx, label in enumerate(labels):
        if label == i:
            temp.append(idx)
            if len(temp) == 5:
                break  # enough samples for this class
    # Always append, so indexForEach[i] lines up with class i even when the
    # fifth match is the very last element or the class has fewer than five
    # examples (previously such a class was silently skipped).
    indexForEach.append(temp)

indexForEach

In [None]:
# Shape of the image array stored in the first batch.
data_batch_1[b'data'].shape

In [None]:
import matplotlib.pyplot as plt

def getFiveImages(toShow, index):
    """Plot five images from the first training batch in a single row.

    Parameters
    ----------
    toShow : sequence of int
        Row indices into ``data_batch_1[b'data']``; the first five are drawn.
    index : str or int
        Numeric class label. It is shown in the figure title and used to look
        up the class name in ``labelList``.
    """
    sampleImages = []
    for i in range(5):
        # Each flat row is reshaped to (3, 32, 32) and then reordered to
        # (32, 32, 3), the layout imshow expects.
        temp = data_batch_1[b'data'][toShow[i]]
        temp = temp.reshape(3, 32, 32)
        temp = temp.transpose(1, 2, 0)
        sampleImages.append(temp)

    # Decode the class name properly instead of stripping the b'...' wrapper
    # from its repr, which would also strip apostrophes inside the name.
    className = labelList[int(index)]
    if isinstance(className, bytes):
        className = className.decode()

    fig, axs = plt.subplots(1, 5, constrained_layout=True)
    fig.suptitle("Label " + str(index) + " (" + className + ")", fontsize=16)
    fig.set_figwidth(15)
    for j in range(5):
        axs[j].imshow(sampleImages[j])

In [None]:
# Draw one row of five sample images for each of the ten classes.
for i in range (10):
    getFiveImages(indexForEach[i], str(i))

I'd like to use label 2, which appears to be birds.

# Data Processing

Now let's unpickle the rest of the files and then see how we can use them to fit a model.

In [None]:
# Load the test batch and the four remaining training batches.
testingData = unpickle("test_batch")
data_batch_2 = unpickle("data_batch_2")
data_batch_3 = unpickle("data_batch_3")
data_batch_4 = unpickle("data_batch_4")
data_batch_5 = unpickle("data_batch_5")


To avoid data imbalance, we would ideally have an equal number of examples for each label, sampled from all of the different batches.

In [None]:
# Size of the test set, then how many of its labels are equal to 2.
print(len(testingData[b'labels']))
testingData[b'labels'].count(2)



In [None]:
# All five training batches, in order, so later cells can iterate over them.
batches = [
    data_batch_1,
    data_batch_2,
    data_batch_3,
    data_batch_4,
    data_batch_5,
]

In [None]:
def formatData(entry):
    """Turn one flat image row into a (32, 32, 3) array.

    The row is first viewed as (3, 32, 32) and the axes are then reordered so
    the size-3 axis comes last.

    Parameters
    ----------
    entry : numpy.ndarray
        Array with 3 * 32 * 32 = 3072 elements.

    Returns
    -------
    numpy.ndarray
        The same values arranged with shape (32, 32, 3).
    """
    return entry.reshape(3, 32, 32).transpose(1, 2, 0)

We need to make the target variable either 1 or 0, based on whether or not the image belongs to the label that we are interested in.

In [None]:
def formatLabels(correct, label):
    """Binarise a label: 1 when it equals the class of interest, otherwise 0.

    Parameters
    ----------
    correct
        The label value treated as the positive class.
    label
        The label to convert.

    Returns
    -------
    int
        1 if ``label == correct``, else 0.
    """
    return 1 if label == correct else 0

In [None]:
# Stack the five training batches end to end, one concatenate call per field.
trainBatches = batches[:5]
combinedTrainData = np.concatenate([batch[b'data'] for batch in trainBatches])
combinedTrainLabels = np.concatenate([batch[b'labels'] for batch in trainBatches])

# Both lengths should match: one label per image.
print(len(combinedTrainData))
print(len(combinedTrainLabels))

In [None]:
#for idx, label in enumerate(trainingLabels):
 #   trainingLabels[idx] = formatLabels(2, label) #change to binary

We should check that there is no data imbalance (i.e. our target is not overrepresented in the data).

In [None]:
# Number of training examples per class (position k = count of label k).
# Uses combinedTrainLabels, which already exists at this point; trainingLabels
# is only created in a later cell, so referencing it here fails on a fresh
# kernel. Equal counts across positions mean no class is overrepresented.
np.bincount(combinedTrainLabels, minlength=10)

In [None]:
# Convert every flat row into a displayable (32, 32, 3) image.
trainImages = [formatData(row) for row in combinedTrainData]
testImages = [formatData(row) for row in testingData[b'data']]

# Preview the first five training images side by side as a sanity check.
fig, axs = plt.subplots(1, 5, constrained_layout=True)
fig.set_figwidth(15)
for position, axis in enumerate(axs):
    axis.imshow(trainImages[position])

In [None]:
# Convert the list of training images and the combined labels to NumPy arrays.
trainingData = np.array(trainImages)
trainingLabels = np.array(combinedTrainLabels)

In [None]:
from keras.utils import to_categorical


# Inputs: the image arrays as they are (the test images are stacked into one array).
train_x, test_x = trainingData, np.array(testImages)

# Targets: integer labels expanded to one-hot rows for the categorical loss.
train_y = to_categorical(trainingLabels)
test_y = to_categorical(np.array(testingData[b'labels']))
print(train_y)



# Data Modelling

In [None]:
import tensorflow as tf
from tensorflow.keras import Input, Sequential, optimizers, losses, callbacks
from datetime import datetime
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Rescaling
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import BatchNormalization
from keras.layers import Dropout
from keras.optimizers import SGD
from tensorboard.plugins.hparams import api as hp


In [None]:
# Hyperparameters explored during tuning.
# hp.Discrete takes ONE iterable of allowed values; passing 'adam' and 'sgd' as
# two positional arguments makes 'sgd' the dtype argument, which is rejected.
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['adam', 'sgd']))
HP_DROPOUT = hp.HParam('dropout', hp.RealInterval(0.2, 0.8))

# Register every tuned hyperparameter and the tracked metric, so the
# TensorBoard HParams dashboard shows a column for each of them.
with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
    hp.hparams_config(
        hparams=[HP_OPTIMIZER, HP_DROPOUT],
        metrics=[hp.Metric('accuracy', display_name='accuracy')]
    )



In [None]:
def trainTestModel(hparams, epochs=1):
    """Build, train and evaluate the CNN for one hyperparameter combination.

    Parameters
    ----------
    hparams : mapping
        Must contain ``HP_OPTIMIZER`` (optimizer name passed to ``compile``)
        and ``HP_DROPOUT`` (dropout rate before the output layer).
    epochs : int, optional
        Number of training epochs for this trial. Defaults to 1 to keep a
        hyperparameter sweep cheap.

    Returns
    -------
    float
        Accuracy of the trained model on ``test_x`` / ``test_y``.

    Notes
    -----
    Trains on the notebook-level ``train_x`` / ``train_y`` arrays and
    evaluates on ``test_x`` / ``test_y``.
    """
    model = Sequential(name="Image_Recognition_Model")
    model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same', input_shape=(32, 32, 3)))
    model.add(Rescaling(scale=1./255, name="Normaliser")) # Example pre-processing layer.
    model.add(BatchNormalization())
    model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(128, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Flatten())
    model.add(Dense(128, activation='relu', kernel_initializer='he_uniform'))
    # Use the dropout VALUE chosen for this trial, not the HParam descriptor itself.
    model.add(Dropout(hparams[HP_DROPOUT]))
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer=hparams[HP_OPTIMIZER],
                loss=losses.CategoricalCrossentropy(),
                metrics=['accuracy'])

    # Train, then score the trial so the caller can log or compare the result
    # (previously the compiled model was simply discarded).
    model.fit(train_x, train_y, epochs=epochs, batch_size=64)
    _, accuracy = model.evaluate(test_x, test_y)
    return accuracy

In [None]:
def generateModel():
    """Create and compile the image-classification CNN.

    The network takes (32, 32, 3) inputs, applies four 3x3 convolutions
    (32, 32, 64 and 128 filters) with 2x2 max pooling after the last three,
    then a 128-unit dense layer, 50% dropout and a 10-way softmax output.
    It is compiled with SGD (learning rate 0.001, momentum 0.9), categorical
    cross-entropy loss and an accuracy metric.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    conv_options = dict(activation='relu', kernel_initializer='he_uniform', padding='same')

    # NOTE(review): the Rescaling layer sits AFTER the first convolution, so
    # that convolution receives the inputs unscaled. Confirm this is intended.
    layer_stack = [
        Conv2D(32, (3, 3), input_shape=(32, 32, 3), **conv_options),
        Rescaling(scale=1./255, name="Normaliser"),  # Example pre-processing layer.
        BatchNormalization(),
        Conv2D(32, (3, 3), **conv_options),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), **conv_options),
        MaxPooling2D((2, 2)),
        Conv2D(128, (3, 3), **conv_options),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(128, activation='relu', kernel_initializer='he_uniform'),
        Dropout(0.5),
        Dense(10, activation='softmax'),
    ]
    model = Sequential(layer_stack, name="Image_Recognition_Model")

    model.compile(
        optimizer=SGD(learning_rate=0.001, momentum=0.9),
        loss=losses.CategoricalCrossentropy(),
        metrics=['accuracy'],
    )
    return model

In [None]:
# Build a fresh, compiled network for this training run.
model = generateModel()

# Callbacks for QoL: TensorBoard logs in a timestamped directory, plus early
# stopping on validation loss that restores the best weights seen.
run_stamp = datetime.now().strftime("%Y-%m-%d-%H%M%S")
log_dir = "output/logs/" + run_stamp
tensorboard_callback = callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
earlystop_callback = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=25,
    restore_best_weights=True,
)
training_callbacks = [tensorboard_callback, earlystop_callback]

# NOTE(review): the test set doubles as validation data here, and patience=25
# leaves little room for early stopping within 30 epochs. Confirm both are intended.
model.fit(
    train_x,
    train_y,
    validation_data=(test_x, test_y),
    epochs=30,
    batch_size=64,
    callbacks=training_callbacks,
)

# Persist the trained network so the prediction cell can reload it.
model.save('final_model.h5')

In [None]:
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.models import load_model
 
def loadImage(filename):
    """Read an image file and shape it as a single-image batch.

    The image is loaded at 32x32, converted to an array and reshaped to
    (1, 32, 32, 3). Pixel values are returned as produced by
    ``img_to_array``; no scaling is applied here.

    Parameters
    ----------
    filename : str
        Path of the image to load.
    """
    resized = load_img(filename, target_size=(32, 32))
    pixels = img_to_array(resized)
    return pixels.reshape(1, 32, 32, 3)

def predictForImage(label, filename='dog.png', modelPath='final_model.h5'):
    """Classify one image with the saved model and report on a class of interest.

    Prints the predicted class index (as returned by ``np.argmax``) and the
    probability the model assigns to ``label``.

    Parameters
    ----------
    label : int
        Class index whose predicted probability is printed.
    filename : str, optional
        Image to classify. Defaults to ``'dog.png'``.
    modelPath : str, optional
        Saved model to load. Defaults to ``'final_model.h5'``.

    Returns
    -------
    str
        Name of the predicted class, looked up in ``labelNames[b'label_names']``.
    """
    img = loadImage(filename)
    # load model
    model = load_model(modelPath)
    result = model.predict(img)
    predicted = np.argmax(result, axis=-1)
    print(predicted)
    print(result[0][label])
    # Look up the name of the class that was actually predicted, rather than
    # evaluating a hard-coded index and throwing the value away.
    name = labelNames[b'label_names'][int(predicted[0])]
    return name.decode() if isinstance(name, bytes) else name

# Classify the sample image and report the probability given to class index 1.
predictForImage(1)