In [2]:
import json
import pickle
import random
import math
from time import time
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, concatenate, Dense, Input, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

In [3]:
class ImageCaptionEmbeddingWithConsensus:
    """Plain record tying one image/caption pair to its embeddings and rating.

    Attributes:
        image_name: identifier of the image.
        caption: caption text paired with the image.
        image_embedding: embedding of the image.
        caption_embedding: embedding of the caption.
        consensus: rating attached to the pair.

    NOTE(review): the pickled sample files loaded by this notebook presumably
    contain instances of this class, in which case the class name and attribute
    names must stay as they are for unpickling to work — confirm before renaming.
    """

    def __init__(self, image_name, caption, image_embedding, caption_embedding, consensus):
        # Store every constructor argument unchanged under the same name.
        self.image_name, self.caption = image_name, caption
        self.image_embedding, self.caption_embedding = image_embedding, caption_embedding
        self.consensus = consensus

In [4]:
# Pickled sample files used for the default run.
image_caption_objects = 'corrected_images_captions_with_embeddings.pkl'
mirrored_objects = 'mirrored_images_captions_with_embeddings.pkl'
aug_data = 'augmented_data_embeddings.pkl'
aug_data_mirrored = 'augmented_data_embeddings_mirrored.pkl'

# encoding='latin1' must be specified in order to unpickle numpy arrays that were
# pickled under Python 2 when reading them from Python 3.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
loaded_parts = []
for pkl_path in (image_caption_objects, mirrored_objects, aug_data, aug_data_mirrored):
    with open(pkl_path, 'rb') as pkl_file:
        loaded_parts.append(pickle.load(pkl_file, encoding='latin1'))
    # Report the size of each file as soon as it has been read.
    print(len(loaded_parts[-1]))

data_list_1, data_list_2, data_list_3, data_list_4 = loaded_parts

12039
12039
5000
5000


In [None]:
# Merge the four loaded parts, in load order, into one working list.
data_list = [*data_list_1, *data_list_2, *data_list_3, *data_list_4]
print(len(data_list))

In [31]:
# Human-ratings-only variant of the data loading - don't run this cell unless needed.
# In these pkl files, the "consensus" field contains the average of the human ratings.
# NOTE(review): running this cell replaces data_list_1, data_list_2 and data_list.
image_caption_objects = 'human_scores_only_embeddings.pkl'
mirrored_objects = 'human_scores_only_embeddings_mirrored.pkl'

# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
loaded_parts = []
for pkl_path in (image_caption_objects, mirrored_objects):
    with open(pkl_path, 'rb') as pkl_file:
        loaded_parts.append(pickle.load(pkl_file, encoding='latin1'))
    print(len(loaded_parts[-1]))

data_list_1, data_list_2 = loaded_parts

data_list = data_list_1 + data_list_2
print(len(data_list))

12039
12039
24078


In [5]:
# VSE++-ratings-only variant of the data loading - don't run this cell unless needed.
# In these pkl files, the "consensus" field contains the vsepp bucketed rating (as pulled from game data).
# Since we are not relying on human ratings here, we have 24,078 samples in total
# (12,039 original + mirrored) without augmentation; with augmented data, 34,078.
# NOTE(review): running this cell replaces data_list_1..data_list_4 and data_list.
image_caption_objects = 'vsepp_scores_only_embeddings.pkl'
mirrored_objects = 'vsepp_scores_only_embeddings_mirrored.pkl'
aug_data = 'augmented_data_embeddings.pkl'
aug_data_mirrored = 'augmented_data_embeddings_mirrored.pkl'

# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
loaded_parts = []
for pkl_path in (image_caption_objects, mirrored_objects, aug_data, aug_data_mirrored):
    with open(pkl_path, 'rb') as pkl_file:
        loaded_parts.append(pickle.load(pkl_file, encoding='latin1'))
    print(len(loaded_parts[-1]))

data_list_1, data_list_2, data_list_3, data_list_4 = loaded_parts

data_list = data_list_1 + data_list_2 + data_list_3 + data_list_4
print(len(data_list))

12039
12039
5000
5000
34078


In [6]:
# 10/17
# ----------------------------

# Run the model with vsepp-only ratings from all available data.
# In these pkl files, the "consensus" field contains the vsepp bucketed rating (as pulled from game data).
# Since we are not relying on human ratings here, we have 24,078 samples in total
# (12,039 original + mirrored) without augmentation.
# Two augmented datasets are added: the 5000 image/captions + mirrored from 9/13,
# and the 18,500 image/captions + mirrored from 10/17.
# With all this data, we have 71,078 image/caption/ratings.
# NOTE(review): running this cell replaces data_list_1..data_list_6 and data_list.

image_caption_objects = 'vsepp_scores_only_embeddings.pkl'
mirrored_objects = 'vsepp_scores_only_embeddings_mirrored.pkl'
aug_data = 'augmented_data_embeddings.pkl'
aug_data_mirrored = 'augmented_data_embeddings_mirrored.pkl'
aug_data_2 = 'all_augment_data_10_17_embeddings.pkl'
aug_data_2_mirrored = 'all_augment_data_10_17_embeddings_mirrored.pkl'

# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
loaded_parts = []
for pkl_path in (
    image_caption_objects,
    mirrored_objects,
    aug_data,
    aug_data_mirrored,
    aug_data_2,
    aug_data_2_mirrored,
):
    with open(pkl_path, 'rb') as pkl_file:
        loaded_parts.append(pickle.load(pkl_file, encoding='latin1'))
    print(len(loaded_parts[-1]))

(data_list_1, data_list_2, data_list_3,
 data_list_4, data_list_5, data_list_6) = loaded_parts

data_list = data_list_1 + data_list_2 + data_list_3 + data_list_4 + data_list_5 + data_list_6
print(len(data_list))

12039
12039
5000
5000
18500
18500
71078


In [5]:
# Spot-check the consensus value stored on one loaded sample (the index looks arbitrary).
print(data_list[1089].consensus)

3


In [7]:
# Example of joining one sample's image and caption embeddings with numpy.
# Concatenating along axis=1 keeps the outer dimension, so the flat vector
# has to be read from index [0] of the result.
im_emb, cap_emb = data_list[0].image_embedding, data_list[0].caption_embedding
concat = np.concatenate([im_emb, cap_emb], axis=1)

In [8]:
# May need to exclude some data - especially data with no human ratings.
# Note that even with no human ratings there still seems to be a consensus value,
# and if there is no cosine similarity data there is still a consensus
# ('Z0', 'Z1' don't always exist).
# Only samples whose consensus is strictly positive are kept.
cleaned_data_list = [sample for sample in data_list if float(sample.consensus) > 0]
# NOTE(review): no random seed is set in the visible cells, so this order
# (and the positional train/test split built on it) changes between runs.
random.shuffle(cleaned_data_list)
print(len(cleaned_data_list))

71078


In [9]:
def make_even_data(game_data_list, num_classes=5):
    """Down-sample a list of rated objects so every rating class is equally sized.

    Each object's ``consensus`` is bucketed with ``int(float(...))``. The size of
    the smallest bucket among ratings ``1..num_classes`` becomes the quota for
    every class, and objects are kept in their original order until their
    class quota is used up.

    Objects whose bucket falls outside ``1..num_classes`` are skipped. Previously
    a bucket of 0 indexed ``counts[-1]`` and silently consumed the quota of the
    highest class, and a bucket above 5 raised ``IndexError``.

    NOTE(review): bucketing truncates, whereas the label-building cell of this
    notebook uses ``round()``; confirm which is intended for non-integer
    consensus values.

    Args:
        game_data_list: sequence of objects exposing a ``consensus`` attribute
            convertible with ``float()``.
        num_classes: number of rating classes, rated ``1..num_classes``
            (default 5, the original hard-coded value).

    Returns:
        A new list with at most ``min_count`` objects per rating class.

    Side effects:
        Prints ``min_count * num_classes`` and then the length of the result.
    """
    # Bucket every object once; the same buckets drive counting and selection.
    buckets = [int(float(obj.consensus)) for obj in game_data_list]

    class_counts = [0] * num_classes
    for bucket in buckets:
        if 1 <= bucket <= num_classes:
            class_counts[bucket - 1] += 1

    # The rarest class decides how many samples each class may contribute.
    min_count = min(class_counts) if class_counts else 0

    remaining = [min_count] * num_classes
    evened_list = []
    for item, bucket in zip(game_data_list, buckets):
        if not 1 <= bucket <= num_classes:
            # Out-of-range ratings must not consume another class's quota.
            continue
        if remaining[bucket - 1] > 0:
            evened_list.append(item)
            remaining[bucket - 1] -= 1

    print(min_count * num_classes)
    print(len(evened_list))
    return evened_list

In [58]:
# Evening out the classes doesn't improve anything
#cleaned_data_list = make_even_data(cleaned_data_list)
#random.shuffle(cleaned_data_list)

21180
21180


In [10]:
# Build one flat feature row and one label per cleaned sample.
im_cap_concat_list = []
labels_list = []
for sample in cleaned_data_list:
    # Join image and caption embeddings side by side; [0] drops the outer dimension.
    joined = np.concatenate((sample.image_embedding, sample.caption_embedding), axis=1)
    im_cap_concat_list.append(joined[0].tolist())
    # Note that the consensus is rounded, then 1 is subtracted so that the scores are 0 - 4.
    labels_list.append(round(float(sample.consensus)) - 1)

im_cap_concat = np.array(im_cap_concat_list)
labels = np.array(labels_list)

In [11]:
print(len(im_cap_concat))
print(len(labels))

# Labels are stored as 0-4; report them on the original 1-5 rating scale.
class_counts = [labels_list.count(class_idx) for class_idx in range(5)]
for rating, count in enumerate(class_counts, start=1):
    print('Count {} = '.format(rating), count)
print('Total = ', sum(class_counts))

71078
71078
Count 1 =  16140
Count 2 =  8950
Count 3 =  14534
Count 4 =  18462
Count 5 =  12992
Total =  71078


In [13]:
# Sanity-check sizes and shapes of the feature matrix, the labels and one feature row.
first_row = im_cap_concat[0]
for value in (
    len(im_cap_concat),
    len(labels),
    len(first_row),
    im_cap_concat.shape,
    first_row.shape,
    first_row,
):
    print(value)

71078
71078
2048
(71078, 2048)
(2048,)
[-0.027239   -0.01018333 -0.00098677 ...  0.01488507  0.01743431
  0.03838466]


In [14]:
# Training hyperparameters.
# NOTE(review): early_stopping_monitor does not appear to be passed to fit() in the visible cells.
early_stopping_monitor = EarlyStopping(patience=3)
batch_size = 100
num_epochs = 1000
learning_rate = 1e-5
decay_rate = learning_rate / num_epochs
num_nodes = 1024

# Three equally sized ReLU layers, each followed by dropout, feeding a
# 5-way softmax over the rating classes. The input is the 2048-wide feature row.
model_2 = Sequential([
    Dense(num_nodes, activation='relu', input_shape=(2048,)),
    Dropout(0.8),
    Dense(num_nodes, activation='relu'),
    Dropout(0.8),
    Dense(num_nodes, activation='relu'),
    Dropout(0.8),
    Dense(5, activation='softmax'),
])

# Each run logs to its own timestamped TensorBoard directory.
tensorboard = TensorBoard(log_dir='logs/{}'.format(time()))

opt = Adam(learning_rate=learning_rate, decay=decay_rate)
# NOTE: to use sparse categorical cross entropy, 1 had to be subtracted from the
# consensus so that the ratings run from 0 to 4.
loss_func = SparseCategoricalCrossentropy()
model_2.compile(
    optimizer=opt,
    loss=loss_func,
    metrics=['accuracy'],
)

In [15]:
# Positional 80/20 split: the first train_size rows are used for fitting,
# the remaining rows are kept for testing.
total_rows = len(im_cap_concat)
train_size = math.floor(total_rows * 0.8)
print(train_size)

test_size = total_rows - train_size
print(test_size)

# fit() additionally holds back 20% of the training rows for validation.
model_2.fit(
    im_cap_concat[:train_size],
    labels[:train_size],
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.2,
    callbacks=[tensorboard],
)



56862
14216
Epoch 1/1000
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
E

<tensorflow.python.keras.callbacks.History at 0x7f90dc766ad0>

In [16]:
# Softmax outputs for the held-out rows (everything after the first train_size rows).
predictions = model_2.predict(im_cap_concat[train_size:])

In [17]:
# Loss and accuracy on the held-out rows that were not passed to fit().
test_loss, test_accuracy = model_2.evaluate(im_cap_concat[train_size:], labels[train_size:])



In [18]:
# Loss and accuracy on the rows passed to fit() (this slice includes the part fit() held back via validation_split).
train_loss, train_accuracy = model_2.evaluate(im_cap_concat[:train_size], labels[:train_size])



In [19]:
print(predictions[0])

# The most probable class index is 0-4; adding 1 maps it back to the 1-5 rating scale.
predicted_ratings = [np.argmax(class_probs) + 1 for class_probs in predictions]
print(len(predicted_ratings))

# Add back the 1 that was subtracted from the labels for training.
actual_ratings = [int(label) + 1 for label in labels[train_size:]]
print(len(actual_ratings))

[3.8904678e-03 1.7867030e-01 8.1045967e-01 6.6648726e-03 3.1466523e-04]
14216
14216


In [20]:
def find_accuracy(predicted, actual):
    """Print overall and per-rating accuracy for two parallel rating sequences.

    Ratings are expected on the 1-5 scale. The pairs are walked with ``zip``, so
    any surplus entries in the longer sequence are ignored.

    Args:
        predicted: iterable of predicted ratings.
        actual: iterable of true ratings; ``int(rating)`` indexes the per-rating
            tallies, so each value must convert to an int in ``0..5``.

    Returns:
        None. Results are only printed, so calling this as the last statement
        of a cell produces no extra cell output.

    Notes:
        With no rating pairs the function prints a short message and returns.
        Previously it raised ``ZeroDivisionError`` in that case.
    """
    # Slot 0 is unused padding so that a rating can be used directly as an index.
    totals_by_rating = [0, 0, 0, 0, 0, 0]
    correct_by_rating = [0, 0, 0, 0, 0, 0]
    num_correct = 0
    num_ratings = 0

    for pred, act in zip(predicted, actual):
        rating = int(act)
        totals_by_rating[rating] += 1
        if pred == act:
            num_correct += 1
            correct_by_rating[rating] += 1
        num_ratings += 1

    if num_ratings == 0:
        # Nothing to score: avoid dividing by zero below.
        print('No ratings to evaluate')
        return

    print('Accuracy = ', num_correct / num_ratings)

    print('---------------------------')
    print('Accuracy by rating 1 - 5')
    print('Test set size=', sum(totals_by_rating))
    print('Number Correct=', sum(correct_by_rating))
    for idx, (correct, total) in enumerate(zip(correct_by_rating, totals_by_rating)):
        # Skip the padding slot and any rating that never occurs in `actual`.
        if idx == 0 or total == 0:
            continue
        print('Rating = ', idx, '| Accuracy = ', correct/total)
    
    
# Report overall and per-rating accuracy for the held-out predictions.
find_accuracy(predicted_ratings, actual_ratings)

Accuracy =  0.7260832864378165
---------------------------
Accuracy by rating 1 - 5
Test set size= 14216
Number Correct= 10322
Rating =  1 | Accuracy =  0.817904993909866
Rating =  2 | Accuracy =  0.47796988287785835
Rating =  3 | Accuracy =  0.6560196560196561
Rating =  4 | Accuracy =  0.7729978354978355
Rating =  5 | Accuracy =  0.7914417887432537


In [21]:
# NOTE(review): this prints every predicted rating; consider a slice such as predicted_ratings[:50] to keep the output short.
print(predicted_ratings)

[1, 3, 3, 1, 3, 3, 3, 3, 5, 2, 4, 1, 2, 4, 3, 1, 1, 3, 2, 1, 1, 1, 1, 5, 3, 5, 2, 5, 1, 3, 1, 5, 1, 1, 2, 4, 1, 5, 2, 2, 1, 2, 3, 1, 4, 1, 1, 5, 4, 3, 5, 4, 2, 2, 5, 2, 2, 2, 3, 4, 5, 2, 5, 1, 3, 1, 1, 5, 1, 2, 5, 3, 5, 2, 3, 4, 4, 4, 5, 5, 2, 2, 5, 5, 3, 1, 1, 2, 1, 3, 1, 2, 5, 5, 5, 4, 1, 2, 1, 1, 4, 1, 4, 3, 1, 2, 3, 1, 3, 2, 2, 1, 1, 1, 5, 2, 1, 1, 2, 5, 1, 1, 3, 4, 2, 3, 1, 4, 3, 3, 3, 1, 2, 5, 5, 4, 3, 2, 1, 1, 5, 2, 1, 4, 5, 4, 2, 3, 3, 5, 1, 5, 1, 1, 1, 1, 4, 4, 3, 1, 3, 1, 2, 1, 1, 3, 3, 4, 2, 3, 4, 3, 4, 3, 5, 3, 5, 1, 3, 1, 1, 1, 4, 4, 1, 1, 4, 3, 3, 3, 3, 5, 1, 3, 1, 5, 5, 1, 3, 2, 1, 4, 1, 4, 1, 3, 2, 2, 4, 5, 4, 2, 1, 2, 3, 1, 2, 4, 2, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 5, 1, 3, 3, 1, 1, 5, 5, 1, 3, 1, 3, 1, 3, 5, 1, 5, 1, 5, 5, 3, 1, 1, 2, 3, 4, 1, 2, 2, 1, 1, 5, 2, 5, 3, 5, 4, 1, 1, 4, 5, 1, 2, 1, 2, 3, 2, 4, 3, 1, 4, 1, 3, 4, 2, 2, 1, 2, 5, 3, 5, 4, 1, 3, 5, 4, 4, 1, 1, 5, 1, 4, 3, 1, 1, 3, 5, 2, 5, 5, 2, 4, 5, 2, 4, 1, 2, 5, 1, 4, 2, 5, 1, 1, 5, 3, 1, 2, 5, 3, 4, 1, 1, 2, 

Using Keras Tuner for hyperparameter tuning

In [None]:
#!pip install -U keras-tuner

In [None]:
import kerastuner as kt

def build_model(hp):
    """Build and compile the classifier for one Keras Tuner trial.

    The network takes a 2048-wide input, applies three Dense(512, relu) layers,
    each followed by dropout, and ends in a 5-way softmax. Two hyperparameters
    are tuned: ``dropout`` (0 to 0.3 in steps of 0.05, default 0.2), shared by
    all three dropout layers, and the Adam ``learning_rate`` (0.00001 to 0.00002
    in steps of 0.000005).

    Args:
        hp: the hyperparameter container handed in by the tuner.

    Returns:
        The compiled ``tf.keras.Model`` (sparse categorical cross-entropy loss,
        accuracy metric).
    """
    # The hyperparameter name is the same for every layer, so look it up once
    # and reuse the value for all three dropout layers.
    dropout_rate = hp.Float('dropout', 0, 0.3, step=0.05, default=0.2)

    inputs = tf.keras.Input(shape=(2048,))
    x = inputs
    for _ in range(3):
        x = tf.keras.layers.Dense(512, activation='relu')(x)
        x = tf.keras.layers.Dropout(dropout_rate)(x)

    outputs = tf.keras.layers.Dense(5, activation='softmax')(x)

    model = tf.keras.Model(inputs, outputs)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(hp.Float('learning_rate', 0.00001, 0.00002, step=0.000005)),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    # The TensorBoard callback that used to be created here was never attached
    # to anything, so it has been removed.
    return model
                
                
# Alternative search strategy, kept for reference:
#tuner = kt.Hyperband(build_model, objective='val_accuracy', max_epochs=300, hyperband_iterations=2, directory='augmented_3layer')

# Random search over build_model's hyperparameters, ranked by validation accuracy.
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=50,
    directory='random_search_sept_20',
)

# Search on the training slice only, with 20% held out for validation;
# early stopping watches val_loss with a patience of 3.
search_callbacks = [EarlyStopping(monitor='val_loss', patience=3)]
tuner.search(
    im_cap_concat[:train_size],
    labels[:train_size],
    epochs=300,
    validation_split=0.2,
    callbacks=search_callbacks,
)


In [None]:
# Keep the top-ranked model from the search.
best_model = tuner.get_best_models(1)[0]

In [None]:
# Show the hyperparameter values of the top-ranked trial.
best_hyperparameters = tuner.get_best_hyperparameters(1)[0]
print(best_hyperparameters.values)

In [None]:
# Print the layer summary of the top-ranked model.
# NOTE(review): this fetches the model from the tuner again; best_model from the earlier cell could presumably be reused.
tuner.get_best_models()[0].summary()

In [None]:
# Print the tuner's summary of the search results.
tuner.results_summary()