## Text Generation 1

In [2]:
import string
import numpy as np
from numpy import array, argmax
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.optimizers import SGD
import random 

In [3]:
# loading text

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [4]:
#print(doc.split("\n"))

In [5]:
# extract descriptions for images
def load_descriptions(doc):
    
    # preprocessing so load_descriptions works
    text = ""
    textList = doc.split("\n")
    for i in range(len(textList)):
        text = text + str(i) + " " + textList[i] + "\n"
    doc = text

    mapping = dict()
    # process lines
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        if len(line) < 2:
            continue
        # take the first token as the image id, the rest as the description
        image_id, image_desc = tokens[0], tokens[1:]
        # remove filename from image id
        image_id = image_id.split('.')[0]
        # convert description tokens back to string
        image_desc = ' '.join(image_desc)
        # create the list if needed
        if image_id not in mapping:
            mapping[image_id] = list()
        # store description
        mapping[image_id].append(image_desc)
    return mapping

In [6]:
#print(descriptions)

In [7]:
def clean_descriptions(descriptions):
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for key, desc_list in descriptions.items():
        for i in range(len(desc_list)):
            desc = desc_list[i]
            # tokenize
            desc = desc.split()
            # convert to lower case
            desc = [word.lower() for word in desc]
            # remove punctuation from each token
            desc = [w.translate(table) for w in desc]
            # remove hanging 's' and 'a'
            desc = [word for word in desc if len(word)>1]
            # remove tokens with numbers in them
            desc = [word for word in desc if word.isalpha()]
            # store as string
            desc_list[i] =  ' '.join(desc)

In [8]:
#print(descriptions)

In [9]:
# convert the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
    # build a list of all description strings
    all_desc = set()
    for key in descriptions.keys():
        [all_desc.update(d.split()) for d in descriptions[key]]
    return all_desc

In [10]:
# save descriptions to file, one per line
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + ' ' + desc)
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [11]:
# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
    # load document
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # split id from description
        image_id, image_desc = tokens[0], tokens[1:]
        # skip images not in the set
        if image_id in dataset:
            # create list
            if image_id not in descriptions:
                descriptions[image_id] = list()
            # wrap description in tokens
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            # store
            descriptions[image_id].append(desc)
    return descriptions

In [12]:
#print(load_clean_descriptions('descriptions.txt', train))

In [13]:
# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# prepare tokenizer
#tokenizer = create_tokenizer(train_descriptions)
#vocab_size = len(tokenizer.word_index) + 1
#print('Vocabulary Size: %d' % vocab_size)

In [14]:
# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, descriptions, vocab_size):
    X1, X2, y = list(), list(), list()
    # walk through each image identifier
    for key, desc_list in descriptions.items():
        # walk through each description for the image
        for desc in desc_list:
            # encode the sequence
            seq = tokenizer.texts_to_sequences([desc])[0]
            # split one sequence into multiple X,y pairs
            for i in range(1, len(seq)):
                # split into input and output pair
                in_seq, out_seq = seq[:i], seq[i]
                # pad input sequence
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                # encode output sequence
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                # store
                #X1.append(photos[key][0])
                X2.append(in_seq)
                y.append(out_seq)
    #return array(X1), array(X2), array(y)
    return array(X2), array(y)


# calculate the length of the description with the most words
def max_length_func(descriptions):
    #print(descriptions)
    lines = to_lines(descriptions)
    #print(lines[0])
    return max(len(d.split()) for d in lines)

In [15]:
def load_set(filename):
    doc = load_doc(filename)
    train = []
    for i in range(len(doc.split("\n"))):
        train.append(str(i))
        
    return(train)

### Stuff to run:

In [16]:
filename = 'Reddit_Religion_short.txt'
#filename = 'Reddit_Long_short.txt'

# load descriptions
doc = load_doc(filename)

# parse descriptions
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))

# clean descriptions
clean_descriptions(descriptions)

# summarize vocabulary
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))

# save descriptions
save_descriptions(descriptions, filename+'descriptions.txt')

Loaded: 201 
Vocabulary Size: 1742


In [17]:
# TRAIN data
 
# load training dataset (6K)
filename = "Reddit_Religion_short.txt"
train = load_set(filename)[:150]
print('Dataset: %d' % len(train))

# descriptions
train_descriptions = load_clean_descriptions(filename+'descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))

# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# determine the maximum sequence length
max_length = max_length_func(train_descriptions)
print('Description Length: %d' % max_length)

# prepare sequences
X2train, ytrain = create_sequences(tokenizer, max_length, train_descriptions, vocab_size)

Dataset: 150
Descriptions: train=150
Vocabulary Size: 1562
Description Length: 348


In [18]:
# TEST data

# load training dataset (6K)
filename = "Reddit_Religion_short.txt"
test = load_set(filename)[151:]
print('Dataset: %d' % len(test))

# descriptions
test_descriptions = load_clean_descriptions(filename+'descriptions.txt', test)
print('Descriptions: train=%d' % len(test_descriptions))

# prepare tokenizer
#tokenizer = create_tokenizer(test_descriptions)
#vocab_size = len(tokenizer.word_index) + 1
#print('Vocabulary Size: %d' % vocab_size)

# determine the maximum sequence length (or: use same as in training data)
#max_length = max_length_func(test_descriptions)
#print('Description Length: %d' % max_length)

# prepare sequences
X2test, ytest = create_sequences(tokenizer, max_length, test_descriptions, vocab_size)

Dataset: 50
Descriptions: train=50


In [19]:
# TAKES A LONG TIME TO RUN
# download pre-made embeddings 
embeddings_index = dict()
f = open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [20]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100)) # the word embeddings are vecs length 100 
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
# define embedding 
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=False)

In [21]:
opt = SGD(lr=0.01, momentum=0.9, decay=0.01)

In [22]:
# define the captioning model
def define_model(vocab_size, max_length):
    # sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    #se1 = e(inputs2)
    se2 = LSTM(256)(se1)
    se3 = Dropout(0.5)(se2)
    #se3 = LSTM(128)(se2)
    
    # decoder model
    #decoder1 = add([se2, se3])
    decoder2 = Dense(256, activation='relu')(se3)
    #decoder2 = Dense(128, activation='relu')(se3)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    
    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    #model.compile(loss='categorical_crossentropy', optimizer=opt)
    
    # summarize model
    print(model.summary())
    plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [23]:
print(max_length)

348


In [24]:
# define the model
model = define_model(vocab_size, max_length)

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 348)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 348, 256)          399872    
_________________________________________________________________
lstm (LSTM)                  (None, 256)               525312    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 256)               65792     
_________________________________________________________________
dense_1 (Dense)              (None, 1562)              401434    
Total params: 1,392,410
Trainable params: 1,392,410
Non-trainable params: 0
____________________________________________

In [25]:
# define checkpoint callback
filepath = 'training2/model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [26]:
# fit model
model.fit([X2train], ytrain, epochs=20, verbose=2, callbacks=[checkpoint], validation_data=([X2test], ytest))
#model.fit([X2train], ytrain, epochs=1, verbose=2, callbacks=[checkpoint], validation_data=([X2train], ytrain))

Epoch 1/20

Epoch 00001: val_loss improved from inf to 5.77440, saving model to training2/model-ep001-loss6.393-val_loss5.774.h5
221/221 - 154s - loss: 6.3930 - val_loss: 5.7744
Epoch 2/20

Epoch 00002: val_loss improved from 5.77440 to 5.76736, saving model to training2/model-ep002-loss5.998-val_loss5.767.h5
221/221 - 180s - loss: 5.9978 - val_loss: 5.7674
Epoch 3/20

Epoch 00003: val_loss improved from 5.76736 to 5.57441, saving model to training2/model-ep003-loss5.797-val_loss5.574.h5
221/221 - 182s - loss: 5.7971 - val_loss: 5.5744
Epoch 4/20

Epoch 00004: val_loss improved from 5.57441 to 5.52768, saving model to training2/model-ep004-loss5.630-val_loss5.528.h5
221/221 - 183s - loss: 5.6299 - val_loss: 5.5277
Epoch 5/20

Epoch 00005: val_loss improved from 5.52768 to 5.48504, saving model to training2/model-ep005-loss5.454-val_loss5.485.h5
221/221 - 184s - loss: 5.4539 - val_loss: 5.4850
Epoch 6/20

Epoch 00006: val_loss improved from 5.48504 to 5.45972, saving model to training2/

<tensorflow.python.keras.callbacks.History at 0x7f27901e1370>

## Generating 

In [27]:

from keras.preprocessing.text import Tokenizer
from pickle import dump


# load training dataset (6K)
train = load_set(filename)
print('Dataset: %d' % len(train))

# descriptions
train_descriptions = load_clean_descriptions(filename+'descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))

# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
# print(tokenizer.word_index[:10])

# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))

Dataset: 201
Descriptions: train=201


In [1]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

# generate a description for an image
#def generate_desc(model, tokenizer, photo, max_length):
def generate_desc(model, tokenizer, seed, max_length):
    # seed the generation process
    in_text = 'startseq ' + seed
    # iterate over the whole length of the sequence
    for i in range(max_length):
        # integer encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad input
        sequence = pad_sequences([sequence], maxlen=max_length)
        # predict next word
        #yhat = model.predict([photo,sequence], verbose=0)
        yhat = model.predict([sequence], verbose=0)
        # convert probability to integer
        yhat = argmax(yhat)
        # map integer to word
        word = word_for_id(yhat, tokenizer)
        # stop if we cannot map the word
        if word is None:
            break
        # append as input for generating the next word
        in_text += ' ' + word
        # stop if we predict the end of the sequence
        if word == 'endseq':
            break
    return in_text

In [2]:
# load the tokenizer
#tokenizer = load(open('tokenizer.pkl', 'rb'))
tokenizer = create_tokenizer(train_descriptions)
# pre-define the max sequence length (from training)
#max_length = 40

NameError: name 'create_tokenizer' is not defined

In [3]:
# load the model
#model = load_model('model-ep020-loss0.795-val_loss0.595.h5') #inturrupting sheep
model = load_model('training2/model-ep006-loss5.293-val_loss5.460.h5')
model.summary()

NameError: name 'load_model' is not defined

In [31]:
# generate description

seed = "knock knock who"

description = generate_desc(model, tokenizer, seed, max_length)
print(description)

InvalidArgumentError:  indices[0,345] = 1695 is not in [0, 1562)
	 [[node functional_1/embedding_1/embedding_lookup (defined at <ipython-input-28-c238889e2909>:21) ]] [Op:__inference_predict_function_18494]

Errors may have originated from an input operation.
Input Source operations connected to node functional_1/embedding_1/embedding_lookup:
 functional_1/embedding_1/embedding_lookup/16838 (defined at /usr/lib/python3.8/contextlib.py:113)

Function call stack:
predict_function


In [170]:
model2 = load_model('model-ep003-loss4.148-val_loss3.852.h5')
model2.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 230)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 230, 256)          542464    
_________________________________________________________________
dropout (Dropout)            (None, 230, 256)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 256)               525312    
_________________________________________________________________
dense (Dense)                (None, 256)               65792     
_________________________________________________________________
dense_1 (Dense)              (None, 2119)              544583    
Total params: 1,678,151
Trainable params: 1,678,151
Non-trainable params: 0
____________________________________________