In [1]:
from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

  from ._conv import register_converters as _register_converters
Using Theano backend.


In [3]:
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [30]:
caps = load_doc('captions.txt').split('\n')
tokens = []
for cap in caps:
    for word in cap.split():
        #if word is not 'STOP':
        tokens.append(word)

length = 10

# organize into sequences of 10 tokens
sequences = list()
for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[i-length:i]
    # convert into a line
    line = ' '.join(seq)
    # store
    sequences.append(line)
    
print(sequences[:10])
print('Total Sequences: %d' % len(sequences))

['web_site spiderman getting roasted STOP monitor love my country STOP', 'spiderman getting roasted STOP monitor love my country STOP screw', 'getting roasted STOP monitor love my country STOP screw science', 'roasted STOP monitor love my country STOP screw science man', 'STOP monitor love my country STOP screw science man is', 'monitor love my country STOP screw science man is back', 'love my country STOP screw science man is back STOP', 'my country STOP screw science man is back STOP stupa', 'country STOP screw science man is back STOP stupa in', 'STOP screw science man is back STOP stupa in thor']
Total Sequences: 25000


In [31]:
out_filename = 'edited_caps.txt'
save_doc(sequences, out_filename)

In [32]:
#convert back to lists of tokens for the tokenizer
lines = []
for seq in sequences:
    lines.append(seq.split())

# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

# vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# separate into input and output
sequences = array(sequences)

X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

In [33]:
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 9, 50)             253450    
_________________________________________________________________
lstm_3 (LSTM)                (None, 9, 100)            60400     
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_4 (Dense)              (None, 5069)              511969    
Total params: 916,319
Trainable params: 916,319
Non-trainable params: 0
_________________________________________________________________
None


In [43]:
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=128, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50

KeyboardInterrupt: 

In [44]:
# save the model to file
model.save('caption_model_3.h5')
# save the tokenizer
dump(tokenizer, open('caption_tokenizer_3.pkl', 'wb'))

In [45]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

# load the model
model = load_model('caption_model_3.h5')
tokenizer = load(open('caption_tokenizer_3.pkl', 'rb'))

# load cleaned text sequences
in_filename = 'padded_captions.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

#classification_result should be the first predicted type generated from gerard's classifier
classification_result = ''
#loop through lines and find a caption that begins with the same class
#use that as seed_text

seq_length = 9
seed_text = 'STOP STOP STOP STOP STOP STOP STOP STOP STOP amphibian '
print('seed text: \n')
print(seed_text + '\n')
#seq_length = len(seed_text)

n_words = 10

result = list()
in_text = seed_text
# generate a fixed number of words
for _ in range(n_words):
    # encode the text as integer
    encoded = tokenizer.texts_to_sequences([in_text])[0]
    # truncate sequences to a fixed length
    encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
    # predict probabilities for each word
    
    #USE LINE BELOW TO IGNORE THE STOP WORD FROM BEING CHOSEN IF WE WANT.
    #OTHERWISE JUST MORE TRAINING WORKED TO GET RID OF TOO HIGH PROB GIVEN TO NULL VALUES
    #print(model.predict_proba(encoded)) 
    
    yhat = model.predict_classes(encoded, verbose=0)
    # map predicted word index to word
    out_word = ''
    for word, index in tokenizer.word_index.items():
        if index == yhat:
            out_word = word
            break
    # append to input
    in_text += ' ' + out_word
    result.append(out_word)
    
meme_caption = ' '.join(result)
print('generated caption: \n')
print(meme_caption)

seed text: 

STOP STOP STOP STOP STOP STOP STOP STOP STOP amphibian 

generated caption: 

STOP you a childhood that i raise your organs STOP
