In [43]:
import tensorflow as tf
from tensorflow.compat.v1.train import AdamOptimizer
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, LSTM, Embedding, BatchNormalization, Bidirectional
from tensorflow.keras.utils import get_file, to_categorical
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.metrics import categorical_accuracy
import numpy as np
import pickle
import random
import sys
import io
import os

# NOTE(review): '/.' resolves to the filesystem root, not the parent directory ('..' would be the parent).
# The import below presumably succeeds because the notebook's working directory is already importable -- confirm.
sys.path.append('/.')  # intended to make the local AttentionWithContext module importable
from AttentionWithContext import AttentionWithContext

In [44]:
# Import pickled text corpus and text tokenizer.
# corpus_tokenized is used below as a list of songs (each a list of tokens) and
# word_tokenizer as a fitted tokenizer exposing word_index / index_word.
# NOTE: pickle.load can execute arbitrary code -- only load files you created or trust.

with open ('./corpus_eng_new', 'rb') as fp:
    corpus_tokenized = pickle.load(fp)

with open ('./tokenizer_new', 'rb') as fp:
    word_tokenizer = pickle.load(fp)

In [46]:
def generate_train_dataset(window, tokenized_corpus=corpus_tokenized):
    """
    Create next-token training data from a tokenized corpus using a sliding window.

    Every run of WINDOW consecutive tokens inside a song produces one sample:
    the first WINDOW-1 tokens are the input sequence and the last token is the label.

    Inputs:
    window is an integer >= 2 (WINDOW-1 input tokens plus one label token).
    tokenized_corpus is a list of songs and each song is a list of tokens.
    Songs shorter than WINDOW tokens contribute no samples (they are skipped, not an error).

    Outputs:
    X is an array of width WINDOW-1 corresponding to sequences of that length.
    y is an array of width 1 corresponding to each next token after the last token in each sequence in X.

    Raises:
    ValueError if window < 2, because the slices below would then be empty or
    wrap around to the end of the song and silently produce meaningless samples.
    """
    if window < 2:
        raise ValueError(f"window must be at least 2 (got {window})")

    seq_len = window - 1
    train_data = []
    train_label = []
    for song in tokenized_corpus:
        # range() is empty when the song is shorter than the window, so short songs are skipped.
        for start in range(len(song) - window + 1):
            train_data.append(song[start:start + seq_len])
            train_label.append(song[start + seq_len])
    return np.array(train_data), np.array(train_label)

In [47]:
# Sliding-window length in tokens: WINDOW-1 input tokens plus 1 label token per sample
WINDOW = 15
# Number of distinct words in the tokenizer's word index
VOCAB_SIZE = len(word_tokenizer.word_index)

In [48]:
# Build the input sequences X (WINDOW-1 tokens each) and next-token labels y from the default corpus
X, y = generate_train_dataset(WINDOW)

In [49]:
# Lookup tables between vocabulary words and their unique integer indices:
# word2num maps word -> index, num2word is the inverse (index -> word)
word2num = word_tokenizer.word_index
num2word = dict(zip(word2num.values(), word2num.keys()))


In [50]:
# Manually convert 2D X to array of numerical indices

def words2seq (array, dictionary=word2num):
  """
  Map every word of a 2D array to its integer index.

  array is a 2D numpy array of words; dictionary maps word -> index.
  Returns an integer array of the same shape. A word missing from
  the dictionary raises KeyError.
  """
  encoded = np.zeros(array.shape)
  for row_idx in range(array.shape[0]):
    # Fill one row at a time; the column range is taken from the array shape
    encoded[row_idx] = [dictionary[array[row_idx][col_idx]]
                        for col_idx in range(array.shape[1])]
  return encoded.astype(int)


In [51]:
# convert 1D y array to numbers
def words2seq_flat (array, dictionary=word2num):
  """
  Map every word of a 1D array to its integer index.

  array is a 1D numpy array of words; dictionary maps word -> index.
  Words missing from the dictionary are encoded as 0.
  Returns an integer array with one entry per input word.
  """
  indices = np.fromiter(
      (dictionary.get(token, 0) for token in array),
      dtype=float,
      count=array.shape[0],
  )
  return indices.astype(int)

In [52]:
# Convert X (2D) and y (1D) from words to their integer vocabulary indices.
# words2seq raises KeyError on an unknown word; words2seq_flat maps unknown words to 0.
Xnum = words2seq(X)
ynum= words2seq_flat(y)


In [53]:
def is_word_in_dict(words, word_index=word2num.keys()):
  """
  Check whether every word of the input is present in the given word index.

  words is either a string (split on whitespace) or a numpy array / list /
  tuple of words. word_index is any container supporting `in`
  (e.g. dict keys or a dict).

  Returns True when all words are found (also for empty input), False as
  soon as one word is missing.
  Raises TypeError for any other input type.

  The type checks use isinstance: comparing type(words) against the strings
  'str' / 'np.ndarray' is never true, which made the check always succeed.
  """
  if isinstance(words, str):
    candidates = words.split()
  elif isinstance(words, np.ndarray):
    # ravel so that multi-dimensional arrays are checked word by word
    candidates = words.ravel()
  elif isinstance(words, (list, tuple)):
    candidates = words
  else:
    raise TypeError(
        "words must be a string or a numpy array/list/tuple of words, got %s"
        % type(words).__name__)
  return all(word in word_index for word in candidates)

In [54]:
def string_to_vec(string, dict=word2num):
  """
  Converts a string to a vector using the given word index dictionary.

  string is split on whitespace and each word is looked up in `dict`
  (word -> index). Returns a 1D numpy array of indices; a word missing
  from the dictionary raises KeyError.

  The lookup uses the `dict` argument, so a caller-supplied dictionary is
  honoured instead of always falling back to the global word2num.
  The parameter name shadows the builtin inside this function only; it is
  kept unchanged so existing keyword callers keep working.
  """
  return np.array([dict[word] for word in string.split()])

In [55]:
# Load the pretrained GloVe vectors into a dict: word -> float32 vector.
embeddings_index = dict()

# The context manager closes the file even if parsing fails, and the explicit
# encoding avoids depending on the platform default when decoding the text.
with open('./glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # skip blank lines instead of failing on values[0]
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400001 word vectors.


In [56]:
# Weight matrix for the Embedding layer: row i holds the pretrained vector of
# the word with tokenizer index i. Row 0 and words without a pretrained
# vector stay all-zero.
embedding_matrix = np.zeros((VOCAB_SIZE+1, 100))

for token, row in word_tokenizer.word_index.items():
    if token not in embeddings_index:
        continue
    embedding_matrix[row] = embeddings_index[token]


In [58]:
# CALLBACK FUNCTIONS
# Include the epoch in the file name (uses `str.format`)
checkpoint_path = "training_2/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights every 5 epochs.
# An integer `save_freq` is not counted in epochs (see the ModelCheckpoint
# docs), so `save_freq=5` wrote checkpoints far more often than intended.
# Saving is therefore kept on the per-epoch schedule and `period=5` restricts
# it to every 5th epoch.
# NOTE: `period` is deprecated in later TF 2.x releases; the replacement is
# save_freq = 5 * <batches per epoch>, which needs the batch size and the
# training-set size to be known at this point.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    verbose=0, 
    save_weights_only=True,
    save_freq='epoch',
    period=5)

# Callback function for early stopping: stop when the training loss has not
# improved by at least 0.002 for 3 consecutive epochs
es_callback = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta=0.002, patience=3, verbose=10)

# Callback that prints a sample of generated text, seeded with a fixed phrase, every 3 epochs
def on_epoch_end(epoch, logs, Xnum=Xnum):
    """
    Invoked by Keras at the end of each epoch; prints generated text on
    epochs 0, 3, 6, ... and does nothing on the others.

    epoch is the zero-based epoch index and logs the metrics dict supplied by
    Keras (unused). Xnum is accepted for signature compatibility and unused.
    """
    if epoch % 3 == 0:
        seed_words = ['all', 'i', 'want', 'is', 'peace', 'and', 'love']
        # Round-trip through the vocabulary: a seed word that is not in
        # word2num raises KeyError here, before any text is generated
        seed_ids = [word2num[seed_word] for seed_word in seed_words]
        seed = " ".join(num2word[seed_id] for seed_id in seed_ids)

        print(f'\nGenerating text after Epoch: {epoch} with seed: \n{seed}\n')

        generated = generate_song(seed)
        sys.stdout.write(generated+'\n')
        sys.stdout.flush()

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [59]:
# Training inputs: integer-encoded sequences and one-hot next-word labels.
# The labels use VOCAB_SIZE+1 classes because tokenizer indices start at 1
# (index 0 is only produced for unknown label words).
X_input = Xnum
y_input = to_categorical(ynum,VOCAB_SIZE+1)
EMBEDDING_DIMS = 100  # width of the pretrained GloVe vectors
NUM_EPOCHS = 200
BATCH_SIZE = 500
callbacks= [cp_callback, es_callback, print_callback]
# Native Keras optimizer instead of the TF1 compat AdamOptimizer, so the
# optimizer is a first-class Keras object (state saved with the model,
# usable by learning-rate callbacks). The hyper-parameters are pinned to the
# TF1 AdamOptimizer defaults so the update rule is unchanged.
adam = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8)

In [60]:
# Next-word prediction model: frozen GloVe embeddings -> BiLSTM -> attention
# pooling over the time steps -> dense softmax over the vocabulary.
model = Sequential([
    # embedding layer with pretrained vectors, not trainable;
    # EMBEDDING_DIMS must match the width of embedding_matrix
    Embedding(VOCAB_SIZE+1, EMBEDDING_DIMS, weights=[embedding_matrix],
              input_length=WINDOW-1, trainable=False),
    BatchNormalization(),
    # return_sequences=True keeps one output per time step for the attention layer
    Bidirectional(LSTM(10, dropout=0.4, return_sequences=True)),
    AttentionWithContext(),
    Dense(100, activation='relu'),
    BatchNormalization(),
    # vocab_size +1 outputs so that every tokenizer index (1..VOCAB_SIZE) is a
    # valid class and index 0 does not go out of bounds
    Dense(VOCAB_SIZE+1, activation='softmax'),
])
# compile the model; `metrics` is passed as a list, the documented form
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 14, 100)           519100    
_________________________________________________________________
batch_normalization_2 (Batch (None, 14, 100)           400       
_________________________________________________________________
bidirectional_1 (Bidirection (None, 14, 20)            8880      
_________________________________________________________________
attention_with_context_1 (At (None, 20)                440       
_________________________________________________________________
dense_2 (Dense)              (None, 100)               2100      
_________________________________________________________________
batch_normalization_3 (Batch (None, 100)               400       
_________________________________________________________________
dense_3 (Dense)              (None, 5191)             

In [61]:
def generate_song(seed, model=model, index_word=word_tokenizer.index_word , word_index=word_tokenizer.word_index, vocab_size=VOCAB_SIZE, generated_length=50):
    """
    Generate a song by repeatedly sampling the next word from the model.

    Inputs:
    seed is a string of whitespace-separated words used as the initial context.
    model is a Keras model returning, per input sequence, a probability
    vector of length vocab_size+1 (index 0 is not a word).
    index_word / word_index are the index -> word and word -> index mappings.
    vocab_size is the number of words (valid indices are 1..vocab_size).
    generated_length is the number of words to append to the seed.

    Output:
    The seed followed by generated_length sampled words, space separated.
    If the seed is empty or contains a word that is not in word_index, the
    seed is returned unchanged (nothing can be generated from it).

    The context is a sliding window with the same number of words as the seed:
    after each sampled word the oldest context word is dropped.
    NOTE(review): the model is built for WINDOW-1 input tokens; a shorter seed
    gives the model less context than it was trained with -- consider seeding
    with WINDOW-1 words.
    """
    string_returned = seed
    seed_words = seed.split()
    if not seed_words or any(word not in word_index for word in seed_words):
        return string_returned

    context = [word_index[word] for word in seed_words]
    context_len = len(context)
    # Candidate word indices; index 0 is excluded because it is not a word
    candidates = range(1, vocab_size + 1)

    for _ in range(generated_length):
        # Shape (1, context_len): ONE sequence, not one sample per token
        batch = np.array([context])
        probabilities = model.predict(batch)[0]
        # probabilities[k] belongs to word index k, so the weights must skip
        # entry 0 to line up with `candidates` (1..vocab_size)
        choice = random.choices(candidates, weights=probabilities[1:vocab_size + 1], k=1)[0]
        string_returned += " " + index_word[choice]
        # Slide the window over word indices (not over characters of a string)
        context = (context + [choice])[-context_len:]
    return string_returned

In [62]:
# Train the model with the checkpoint, early-stopping and text-preview callbacks;
# validation_split=0.2 holds out 20% of the samples for validation.
# The returned History object is pickled in the next cell.
history = model.fit(X_input, y_input, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, callbacks=callbacks, validation_split=0.2, verbose=1)

Epoch 1/200
Generating text after Epoch: 0 with seed: 
all i want is peace and love

all i want is peace and love echoes lesson under he's my wild stones fix beat beautiful victims find salvation own spotlight howling day glory pump ooh nightmares every popular sometimes ride popular code w bras grab life after spend read return become you've does popular take faster lips oh-oh as down you've glory called getting as
Epoch 2/200
Epoch 3/200
Epoch 4/200
Generating text after Epoch: 3 with seed: 
all i want is peace and love

all i want is peace and love oh ti dance really long flame ti la eternally long flame own forgotten really scared own reality god really it's flame ti know keeps flame really own lights somewhere own breathe own ti knew the had the own reality hold reality everyone world's flame really long my don't reality left
Epoch 5/200
Epoch 6/200
Epoch 7/200
Generating text after Epoch: 6 with seed: 
all i want is peace and love

all i want is peace and love i scare own really 

In [None]:
# Persist the per-epoch metrics recorded by model.fit so they can be reloaded later
with open('./history', 'wb') as history_file:
    pickle.dump(history.history, history_file)

In [None]:
# Save the trained model to ./model.
# NOTE(review): the recorded output below reports assets written to ./model_tues,
# so it presumably comes from an earlier run with a different path -- confirm.
model.save('./model')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: ./model_tues/assets
