In [None]:
# https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/

In [115]:
import string
import numpy as np
from numpy import array, argmax
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.optimizers import SGD
import random 

In [80]:
# loading text

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [4]:
#print(doc.split("\n"))

In [81]:
# extract descriptions for images
def load_descriptions(doc):
    
    # preprocessing so load_descriptions works
    text = ""
    textList = doc.split("\n")
    for i in range(len(textList)):
        text = text + str(i) + " " + textList[i] + "\n"
    doc = text

    mapping = dict()
    # process lines
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        if len(line) < 2:
            continue
        # take the first token as the image id, the rest as the description
        image_id, image_desc = tokens[0], tokens[1:]
        # remove filename from image id
        image_id = image_id.split('.')[0]
        # convert description tokens back to string
        image_desc = ' '.join(image_desc)
        # create the list if needed
        if image_id not in mapping:
            mapping[image_id] = list()
        # store description
        mapping[image_id].append(image_desc)
    return mapping

In [26]:
#print(descriptions)

{'0': ["I got an A in philosophy last semester by proving that my professor doesn't exist."], '1': ['Copy-editing is a very stressful line of work. Every time one of us misses a period, we get really nervous.'], '2': ["I own the world's worst thesaurus. Not only is it awful, it's awful."], '3': ['']}


In [82]:
def clean_descriptions(descriptions):
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for key, desc_list in descriptions.items():
        for i in range(len(desc_list)):
            desc = desc_list[i]
            # tokenize
            desc = desc.split()
            # convert to lower case
            desc = [word.lower() for word in desc]
            # remove punctuation from each token
            desc = [w.translate(table) for w in desc]
            # remove hanging 's' and 'a'
            desc = [word for word in desc if len(word)>1]
            # remove tokens with numbers in them
            desc = [word for word in desc if word.isalpha()]
            # store as string
            desc_list[i] =  ' '.join(desc)

In [83]:
#print(descriptions)

In [84]:
# convert the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
    # build a list of all description strings
    all_desc = set()
    for key in descriptions.keys():
        [all_desc.update(d.split()) for d in descriptions[key]]
    return all_desc

In [85]:
# save descriptions to file, one per line
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + ' ' + desc)
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [86]:
# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
    # load document
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # split id from description
        image_id, image_desc = tokens[0], tokens[1:]
        # skip images not in the set
        if image_id in dataset:
            # create list
            if image_id not in descriptions:
                descriptions[image_id] = list()
            # wrap description in tokens
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            # store
            descriptions[image_id].append(desc)
    return descriptions

In [87]:
#print(load_clean_descriptions('descriptions.txt', train))

In [88]:
# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# prepare tokenizer
#tokenizer = create_tokenizer(train_descriptions)
#vocab_size = len(tokenizer.word_index) + 1
#print('Vocabulary Size: %d' % vocab_size)

In [89]:
# create sequences of images, input sequences and output words for an image
#def create_sequences(tokenizer, max_length, descriptions, photos, vocab_size):
def create_sequences(tokenizer, max_length, descriptions, vocab_size):
    X1, X2, y = list(), list(), list()
    # walk through each image identifier
    for key, desc_list in descriptions.items():
        # walk through each description for the image
        for desc in desc_list:
            # encode the sequence
            seq = tokenizer.texts_to_sequences([desc])[0]
            # split one sequence into multiple X,y pairs
            for i in range(1, len(seq)):
                # split into input and output pair
                in_seq, out_seq = seq[:i], seq[i]
                # pad input sequence
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                # encode output sequence
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                # store
                #X1.append(photos[key][0])
                X2.append(in_seq)
                y.append(out_seq)
    #return array(X1), array(X2), array(y)
    return array(X2), array(y)


# calculate the length of the description with the most words
def max_length_func(descriptions):
    #print(descriptions)
    lines = to_lines(descriptions)
    #print(lines[0])
    return max(len(d.split()) for d in lines)

In [90]:
def load_set(filename):
    doc = load_doc(filename)
    train = []
    for i in range(len(doc.split("\n"))):
        train.append(str(i))
        
    return(train)

### Stuff to run:

In [101]:
filename = 'Reddit_Religion_short.txt'
#filename = 'Reddit_Long_short.txt'

# load descriptions
doc = load_doc(filename)

# parse descriptions
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))

# clean descriptions
clean_descriptions(descriptions)

# summarize vocabulary
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))

# save descriptions
save_descriptions(descriptions, filename+'descriptions.txt')

Loaded: 201 
Vocabulary Size: 1742


In [102]:
# TRAIN data
 
# load training dataset (6K)
filename = "Reddit_Religion_short.txt"
train = load_set(filename)[:150]
print('Dataset: %d' % len(train))

# descriptions
train_descriptions = load_clean_descriptions(filename+'descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))

# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# determine the maximum sequence length
max_length = max_length_func(train_descriptions)
print('Description Length: %d' % max_length)

# prepare sequences
X2train, ytrain = create_sequences(tokenizer, max_length, train_descriptions, vocab_size)

Dataset: 150
Descriptions: train=150
Vocabulary Size: 1562
Description Length: 348


In [104]:
# TEST data

# load training dataset (6K)
filename = "Reddit_Religion_short.txt"
test = load_set(filename)[151:]
print('Dataset: %d' % len(test))

# descriptions
test_descriptions = load_clean_descriptions(filename+'descriptions.txt', test)
print('Descriptions: train=%d' % len(test_descriptions))

# prepare tokenizer
#tokenizer = create_tokenizer(test_descriptions)
#vocab_size = len(tokenizer.word_index) + 1
#print('Vocabulary Size: %d' % vocab_size)

# determine the maximum sequence length
#max_length = max_length_func(test_descriptions)
#print('Description Length: %d' % max_length)

# prepare sequences
X2test, ytest = create_sequences(tokenizer, max_length, test_descriptions, vocab_size)

Dataset: 50
Descriptions: train=50
Description Length: 215


In [14]:
# TAKES A LONG TIME TO RUN
# download pre-made embeddings 
embeddings_index = dict()
f = open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [105]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
# define embedding 
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=False)

In [116]:
opt = SGD(lr=0.01, momentum=0.9, decay=0.01)

In [143]:
# define the captioning model
def define_model(vocab_size, max_length):
    # sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    #se1 = e(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    #se3 = LSTM(128)(se2)
    
    # decoder model
    #decoder1 = add([se2, se3])
    decoder2 = Dense(256, activation='relu')(se3)
    #decoder2 = Dense(128, activation='relu')(se3)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    
    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs2], outputs=outputs)
    #model.compile(loss='categorical_crossentropy', optimizer='adam')
    model.compile(loss='categorical_crossentropy', optimizer=opt)
    
    # summarize model
    print(model.summary())
    plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [144]:
print(max_length)

215


In [145]:
# define the model
model = define_model(vocab_size, max_length)

Model: "functional_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         [(None, 215)]             0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 215, 256)          399872    
_________________________________________________________________
dropout_8 (Dropout)          (None, 215, 256)          0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense_16 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_17 (Dense)             (None, 1562)              401434    
Total params: 1,392,410
Trainable params: 1,392,410
Non-trainable params: 0
___________________________________________

In [146]:
# define checkpoint callback
filepath = 'training/model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [147]:
# fit model
model.fit([X2train], ytrain, epochs=1, verbose=2, callbacks=[checkpoint], validation_data=([X2test], ytest))
#model.fit([X2train], ytrain, epochs=1, verbose=2, callbacks=[checkpoint], validation_data=([X2train], ytrain))


Epoch 00001: val_loss improved from inf to 7.32013, saving model to training/model-ep001-loss7.339-val_loss7.320.h5
221/221 - 159s - loss: 7.3393 - val_loss: 7.3201


<tensorflow.python.keras.callbacks.History at 0x7feef3d022b0>

## Generating 

In [20]:

from keras.preprocessing.text import Tokenizer
from pickle import dump


# load training dataset (6K)
train = load_set(filename)
print('Dataset: %d' % len(train))

# descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))

# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)

# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))

Dataset: 401
Descriptions: train=401


In [21]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

# generate a description for an image
#def generate_desc(model, tokenizer, photo, max_length):
def generate_desc(model, tokenizer, seed, max_length):
    # seed the generation process
    in_text = 'startseq ' + seed
    # iterate over the whole length of the sequence
    for i in range(max_length):
        # integer encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad input
        sequence = pad_sequences([sequence], maxlen=max_length)
        # predict next word
        #yhat = model.predict([photo,sequence], verbose=0)
        yhat = model.predict([sequence], verbose=0)
        # convert probability to integer
        yhat = argmax(yhat)
        # map integer to word
        word = word_for_id(yhat, tokenizer)
        # stop if we cannot map the word
        if word is None:
            break
        # append as input for generating the next word
        in_text += ' ' + word
        # stop if we predict the end of the sequence
        if word == 'endseq':
            break
    return in_text

In [22]:
# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))
# pre-define the max sequence length (from training)
#max_length = 40

In [148]:
# load the model
#model = load_model('model-ep020-loss0.795-val_loss0.595.h5') #inturrupting sheep
model = load_model('training/model-ep001-loss7.339-val_loss7.320.h5')
model.summary()

Model: "functional_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         [(None, 215)]             0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 215, 256)          399872    
_________________________________________________________________
dropout_8 (Dropout)          (None, 215, 256)          0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense_16 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_17 (Dense)             (None, 1562)              401434    
Total params: 1,392,410
Trainable params: 1,392,410
Non-trainable params: 0
___________________________________________

In [133]:
model2 = load_model('training/model-ep004-loss3.689-val_loss3.341.h5')
model2.summary()

Model: "functional_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 230)]             0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 230, 100)          211900    
_________________________________________________________________
dropout_4 (Dropout)          (None, 230, 100)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 256)               365568    
_________________________________________________________________
dense_8 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_9 (Dense)              (None, 2119)              544583    
Total params: 1,187,843
Trainable params: 975,943
Non-trainable params: 211,900
________________________________________

In [150]:
# generate description

seed = "and god said "

description = generate_desc(model2, tokenizer, seed, max_length)
print(description)

startseq and god said  the the you startseq he he endseq
