In [1]:
import numpy as np
import os
import pickle
from tqdm import tqdm
import keras
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, add, LSTM, Embedding, Dropout, Conv2D, MaxPooling2D, BatchNormalization, Flatten
from tensorflow.keras.models import Model, Sequential
import tensorflow as tf

In [3]:
base_dir = r'D:\University Files\Assignments\7th Semester\Machine Learning\Project\VizWiz Dataset'
train_img_dir = r'D:\University Files\Assignments\7th Semester\Machine Learning\Project\VizWiz Dataset\train'
val_img_dir = r'D:\University Files\Assignments\7th Semester\Machine Learning\Project\VizWiz Dataset\val'
test_img_dir = r'D:\University Files\Assignments\7th Semester\Machine Learning\Project\VizWiz Dataset\test'
train_annot_dir = r'D:\University Files\Assignments\7th Semester\Machine Learning\Project\VizWiz Dataset\annotations\train.json'
val_annot_dir = r'D:\University Files\Assignments\7th Semester\Machine Learning\Project\VizWiz Dataset\annotations\val.json'
test_annot_dir = r'D:\University Files\Assignments\7th Semester\Machine Learning\Project\VizWiz Dataset\annotations\test.json'

In [4]:
import json

with open(train_annot_dir, 'r') as file:
    train_dict = json.load(file)

with open(val_annot_dir, 'r') as file:
    val_dict = json.load(file)

with open(test_annot_dir, 'r') as file:
    test_dict = json.load(file)

In [5]:
train_img_mapping = {}

for indiv_dict in train_dict['images']:
    img_name = indiv_dict['file_name']
    img_id = indiv_dict['id']
    train_img_mapping[img_name] = img_id

In [6]:
val_img_mapping = {}

for indiv_dict in val_dict['images']:
    img_name = indiv_dict['file_name']
    img_id = indiv_dict['id']
    val_img_mapping[img_name] = img_id

In [7]:
train_set = []
validation_set = []

 # FIGURE OUT A WAY TO USE CUSTOM CONVNET

In [8]:
HEIGHT = 224
WIDTH = 224

shape = (HEIGHT, WIDTH, 3)

Initial Convolutional Neural Network

In [12]:
def define_model(neurons, dense_layers, bn, dropouts):
  model = Sequential()

  for i, nodes in enumerate(neurons):
    if i == 0:
      model.add(
          Conv2D(nodes, (3, 3), input_shape=shape, activation='relu'))
      model.add(MaxPooling2D(pool_size=(2, 2)))
      if bn:
        model.add(BatchNormalization())
    else:
      model.add(Conv2D(nodes, (3, 3), activation='relu'))
      model.add(MaxPooling2D(pool_size=(2, 2)))

  model.add(Flatten())

  for i, nodes in enumerate(dense_layers):
    model.add(Dense(nodes, activation='relu'))
    model.add(Dropout(dropouts[i]))

  model.add(Dense(1, activation='sigmoid'))
  model.compile(loss="binary_crossentropy",
                optimizer='adam', metrics=["accuracy"])
  return model

In [None]:
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    verbose=1,
    mode='max'
)

cp = tf.keras.callbacks.ModelCheckpoint(
    'best_aug.h5',
    monitor='val_accuracy',
    mode='max',
    verbose=1,
    save_best_only=True
)

history = model.fit(X_train, y_train, validation_data=(
    X_val, y_val), batch_size=132, epochs=30, callbacks=[es, cp], verbose=2)

In [18]:
# Define VGG16 Model
model = VGG16()
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [19]:
def get_features(model, split_set, set_dict):
    image_features = {}
    directory = os.path.join(base_dir, split_set)

    for img_name in tqdm(os.listdir(directory)):
        if not (img_name in set_dict.keys()): continue
        # Load an image
        img_path = os.path.join(directory, img_name)
        img = load_img(img_path, target_size=(HEIGHT, WIDTH))
        # Convert image into numpy pixel values
        img = img_to_array(img)
        # Reshape the data for the model
        img = img.reshape((1, img.shape[0], img.shape[1], img.shape[2]))
        # Preprocess
        img = preprocess_input(img)
        # Extract features
        feature = model.predict(img, verbose=0)
        # Map img_id with its features
        img_id = set_dict[img_name]
        image_features[img_id] = feature
    return image_features

In [20]:
train_img_features = get_features(model, 'train', train_img_mapping)

 34%|███▍      | 8211/23954 [1:21:10<1:57:15,  2.24it/s]

In [None]:
val_img_features = get_features(model, 'val', val_img_mapping)

In [None]:
train_img_features_path = r'D:\University Files\Assignments\7th Semester\Machine Learning\Project\loaded_data\train_img_features.pkl'
val_img_features_path = r'D:\University Files\Assignments\7th Semester\Machine Learning\Project\loaded_data\val_img_features.pkl'

In [None]:
import pickle 
pickle.dump(train_img_features, train_img_features_path, 'wb')
pickle.dump(val_img_features, val_img_features_path, 'wb')

In [None]:
t = train_img_features
v = val_img_features

In [None]:
with open(train_img_features_path, 'rb') as f:
    train_img_features = pickle.load(f)
with open(val_img_features_path, 'rb') as f:
    val_img_features = pickle.load(f)

In [15]:
def get_captions(img_name, dict_set):
    img_captions = []
    for d in dict_set['images']:
        encountered = False
        if d.get('file_name') == img_name:
            img_id = d.get('id')
            for k in dict_set['annotations']:
                if k.get('image_id') != img_id and encountered: break
                if k.get('image_id') == img_id:
                    encountered = True
                    if k.get('is_rejected') == False:
                        img_captions.append(k.get('caption'))
                        print(k.get('caption'))
                        
    return img_captions

In [16]:
train_img_to_captions = {}

for img_name, img_id in train_img_mapping.items():
    if img_id not in train_img_to_captions:
        train_img_to_captions[img_id] = get_captions(img_name, train_dict)

train_img_to_captions

ITS IS A BASIL LEAVES CONTAINER ITS CONTAINS THE NET WEIGHT TOO.
A green and white plastic condiment bottle containing Basil leaves.
A bottle of spices in a plastic container laying on a surface.
some basil leaves in a container on a counter
A can of Coca Cola on a counter is shown for when one can use a nice, cold drink.
A black can of Coca Cola Zero calorie soda is on the counter near the coffee maker.
A kitchen counter the various items on top including a can of Coca-Cola, metal containers, and a teapot.
a black tin of Coca Cola placed on a black surface
Black counter with canisters, kettle and can of soda.
A can of crushed tomatoes are on a brown surface, the tomatoes read crushed tomatoes on the brand.
A can of crushed tomatoes sitting on a beige colored counter.
a can of crushed tomatoes in puree from price chopper.
a Price Chopper branded can of crushed tomatoes
Image is a can of crushed tomatoes in view.
A white screen with a captcha that needs to be completed.
A screenshot of 

In [None]:
# Do it for validation

In [None]:
def preprocess_text(mapping):
    for key, captions in mapping.items():
        for i in range(len(captions)):
            caption = caption[i]
            caption = caption.lower()
            caption = caption.replace('[^A-Za-z]', '')
            caption = caption.replace('\s+', '')
            caption = '<start> ' + ' '.join([word for word in caption.split() if len(word) > 1]) + ' <end>'
            captions[i] = caption

In [None]:
# Before preprocessing captions:
train_img_to_captions[0]

In [None]:
# After preprocessing
preprocess_text(train_img_to_captions)
train_img_to_captions[0]

In [None]:
# Get all vocabulary 

all_captions = []

for key in mapping.keys():
    for caption in mapping[key]:
        all_captions.append(caption)

len(all_captions)

In [None]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1
vocab_size

In [None]:
# get max length of a caption:
max_length = max(len(caption.split()) for caption in all_captions)
max_length