In [None]:
from os import listdir
from glob import glob
from numpy import array
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.utils import to_categorical
from keras.layers.convolutional import Conv2D
from keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint
from keras.layers import Embedding, TimeDistributed, RepeatVector, LSTM, concatenate , Input, Reshape, Dense, Flatten, Dropout
from keras.preprocessing.image import array_to_img, img_to_array, load_img
from keras.applications.inception_resnet_v2 import InceptionResNetV2, preprocess_input
import numpy as np

In [None]:
# Load the images and preprocess them for inception-resnet
images = []
all_images = glob('data/jpeg/mal*')
all_images = all_images[0:5]

In [None]:
all_images.sort()
for filename in all_images:
    images.append(img_to_array(load_img(filename, target_size=(299, 299))))
images = np.array(images, dtype=float)

In [None]:
all_html = [im.replace("data/jpeg", "data/html").replace(".jpeg", ".html") for im in all_images]

text = []
for filename in all_html:
    with open(filename, "r") as f:
        text_ = "<START> " + f.read() + " <END>"
        text_ = ' '.join(text_.split())
        text_ = text_.replace(',', ' ,')
        text.append(text_)

In [None]:
all_images[0]

In [None]:
# Initialize the function to create the vocabulary 
tokenizer = Tokenizer(filters='', split=" ", lower=False)
# Create the vocabulary 
tokenizer.fit_on_texts(text)
# Add +1 to leave space for empty words
vocab_size = len(tokenizer.word_index) + 1
# Translate each word in text file to the matching vocabulary index
train_sequences = tokenizer.texts_to_sequences(text)
# The longest HTML file
max_sequence = max(len(s) for s in train_sequences)
max_length = 48

In [None]:
def preprocess_data(sequences, features):
    X, y, image_data = list(), list(), list()
    for img_no, seq in enumerate(sequences):
        for i in range(1, len(seq)):
            # Add the sentence until the current count(i) and add the current count to the output
            in_seq, out_seq = seq[:i], seq[i]
            # Pad all the input token sentences to max_sequence
            in_seq = pad_sequences([in_seq], maxlen=max_sequence)[0]
            # Turn the output into one-hot encoding
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # Add the corresponding image to the boostrap token file
            image_data.append(features[img_no])
            # Cap the input sentence to 48 tokens and add it
            X.append(in_seq[-48:])
            y.append(out_seq)
    return np.array(X), np.array(y), np.array(image_data)
X, y, image_data = preprocess_data(train_sequences, images)

In [None]:
#Create the encoder
image_model = Sequential()
image_model.add(Conv2D(16, (3, 3), padding='valid', activation='relu', input_shape=(299, 299, 3,)))
image_model.add(Conv2D(16, (3, 3), activation='relu', padding='same', strides=2))
image_model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
image_model.add(Conv2D(32, (3, 3), activation='relu', padding='same', strides=2))
image_model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
image_model.add(Conv2D(64, (3, 3), activation='relu', padding='same', strides=2))
image_model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))

image_model.add(Flatten())
image_model.add(Dense(1024, activation='relu'))
image_model.add(Dropout(0.3))
image_model.add(Dense(1024, activation='relu'))
image_model.add(Dropout(0.3))

image_model.add(RepeatVector(max_length))

visual_input = Input(shape=(299, 299, 3,))
encoded_image = image_model(visual_input)

language_input = Input(shape=(max_length,))
language_model = Embedding(vocab_size, 50, input_length=max_length, mask_zero=True)(language_input)
language_model = LSTM(128, return_sequences=True)(language_model)
language_model = LSTM(128, return_sequences=True)(language_model)

#Create the decoder
decoder = concatenate([encoded_image, language_model])
decoder = LSTM(512, return_sequences=True)(decoder)
decoder = LSTM(512, return_sequences=False)(decoder)
decoder = Dense(vocab_size, activation='softmax')(decoder)

# Compile the model
model = Model(inputs=[visual_input, language_input], outputs=decoder)
optimizer = RMSprop(lr=0.0001, clipvalue=1.0)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [None]:
#Save the model for every 2nd epoch
filepath="org-weights-epoch-{epoch:04d}--val_loss-{val_loss:.4f}--loss-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_weights_only=True, period=2)
callbacks_list = [checkpoint]

In [None]:
# Train the model
model.fit([image_data, X], y, batch_size=1, shuffle=False, 
          validation_split=0.1, callbacks=callbacks_list, verbose=1, epochs=50)

# Batched approach

In [1]:
from os import listdir
from glob import glob
from numpy import array
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.utils import to_categorical
from keras.layers.convolutional import Conv2D
from keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint
from keras.layers import Embedding, TimeDistributed, RepeatVector, LSTM, concatenate , Input, Reshape, Dense, Flatten, Dropout
from keras.preprocessing.image import array_to_img, img_to_array, load_img
from keras.applications.inception_resnet_v2 import InceptionResNetV2, preprocess_input
import numpy as np

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def get_image(fname):
    loaded_ = load_img(fname, target_size=(299, 299))
    return img_to_array(loaded_)

In [3]:
def read_all_html():
    all_html = glob("data/html_train/*html")
    all_html.sort()
    text = []
    for filename in all_html:
        with open(filename, "r") as f:
            text_ = "<START> " + f.read() + " <END>"
            text_ = ' '.join(text_.split())
            text_ = text_.replace(',', ' ,')
            text.append(text_)
    return text

In [4]:
def build_vocab(html_text):
    # Initialize the function to create the vocabulary 
    tokenizer = Tokenizer(filters='', split=" ", lower=False)
    # Create the vocabulary 
    tokenizer.fit_on_texts(html_text)
    # Add +1 to leave space for empty words
    vocab_size = len(tokenizer.word_index) + 1
    # Translate each word in text file to the matching vocabulary index
    train_sequences = tokenizer.texts_to_sequences(html_text)
    # The longest HTML file
    max_sequence = max(len(s) for s in train_sequences)
    max_length = 48
    return train_sequences, max_sequence, max_length, vocab_size

In [5]:
def preprocess_data(sequences, features, max_sequence, vocab_size):
    X, y, image_data = list(), list(), list()
    for img_no, seq in enumerate(sequences):
        for i in range(1, len(seq)):
            # Add the sentence until the current count(i) and add the current count to the output
            in_seq, out_seq = seq[:i], seq[i]
            # Pad all the input token sentences to max_sequence
            in_seq = pad_sequences([in_seq], maxlen=max_sequence)[0]
            # Turn the output into one-hot encoding
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # Add the corresponding image to the boostrap token file
            image_data.append(features[img_no])
            # Cap the input sentence to 48 tokens and add it
            X.append(in_seq[-48:])
            y.append(out_seq)
    return np.array(X), np.array(y), np.array(image_data)

In [7]:
def batch_generator(train_sequences, max_sequence, vocab_size, jpeg_files, batch_size):
    i = 0
    while True:
        if i >= len(jpeg_files):
            i = 0
        out = []
        print("i = %s" % i)
        images = [get_image(_) for _ in jpeg_files[i:i+batch_size]]
        print("n images = %s"  % len(images))
        X, y, image_data = preprocess_data(train_sequences[i:i+batch_size], images, max_sequence, vocab_size)
        i += batch_size
        yield [image_data, X],y

In [None]:
html_ = read_all_html()
train_sequences, max_sequence, max_length, vocab_size = build_vocab(html_)
jpeg_files = glob("data/jpeg/*.jpeg")
jpeg_files.sort()

In [8]:
n_training = 2*len(train_sequences)//3

train_sequences_train = train_sequences[0:n_training]
jpeg_files_train = jpeg_files[0:n_training]

train_sequences_val = train_sequences[n_training:]
jpeg_files_val = jpeg_files[n_training:]

In [9]:
#Create the encoder
image_model = Sequential()
image_model.add(Conv2D(16, (3, 3), padding='valid', activation='relu', input_shape=(299, 299, 3,)))
image_model.add(Conv2D(16, (3, 3), activation='relu', padding='same', strides=2))
image_model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
image_model.add(Conv2D(32, (3, 3), activation='relu', padding='same', strides=2))
image_model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
image_model.add(Conv2D(64, (3, 3), activation='relu', padding='same', strides=2))
image_model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))

image_model.add(Flatten())
image_model.add(Dense(1024, activation='relu'))
image_model.add(Dropout(0.3))
image_model.add(Dense(1024, activation='relu'))
image_model.add(Dropout(0.3))

image_model.add(RepeatVector(48))

visual_input = Input(shape=(299, 299, 3,))
encoded_image = image_model(visual_input)

language_input = Input(shape=(48,))
language_model = Embedding(vocab_size, 50, input_length=48, mask_zero=True)(language_input)
language_model = LSTM(128, return_sequences=True)(language_model)
language_model = LSTM(128, return_sequences=True)(language_model)

#Create the decoder
decoder = concatenate([encoded_image, language_model])
decoder = LSTM(512, return_sequences=True)(decoder)
decoder = LSTM(512, return_sequences=False)(decoder)
decoder = Dense(vocab_size, activation='softmax')(decoder)

# Compile the model
model = Model(inputs=[visual_input, language_input], outputs=decoder)
optimizer = RMSprop(lr=0.0001, clipvalue=1.0)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [10]:
model.fit_generator(batch_generator(train_sequences_train, max_sequence, vocab_size, jpeg_files_train, 2), 
                    validation_data = batch_generator(train_sequences_val, max_sequence, vocab_size, jpeg_files_val, 2),
                    validation_steps = 2,#len(jpeg_files) - n_training,
                    steps_per_epoch=2, 
                    epochs=5, 
                    verbose=1,
                    max_queue_size=3)

i = 0Epoch 1/5

n images = 2
i = 2
n images = 2
i = 4
n images = 2
i = 6
n images = 2
n images = 2
i = 0
n images = 2
i = 2
n images = 2
i = 4
n images = 2
i = 6
n images = 2
i = 8
n images = 2
Epoch 2/5
i = 10
n images = 2


KeyboardInterrupt: 