<a href="https://colab.research.google.com/github/andy311p/Udemy_advanced_NLP/blob/master/NLP_poetry_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Installations
!pip install kaggle
!cp drive/My\ Drive/nlp_course/kaggle.json /root/.kaggle/
!git clone https://github.com/lazyprogrammer/machine_learning_examples.git

In [4]:
#Imports and HyperParameters
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.models import Model
from keras.layers import Dense, Input, Embedding, LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam, SGD

# Upper bound on the padded sequence length; the longest line in the data is
# used instead when it is shorter than this.
MAX_SEQUENCE_LENGTH = 100
# Vocabulary cap handed to the Tokenizer and used to size the embedding matrix.
MAX_VOCAB_SIZE = 3000
# Width of the word vectors; also selects which glove.6B.<dim>d.txt file is read.
EMBEDDING_DIM = 50
# Passed to model.fit as validation_split.
VALIDATION_SPLIT = 0.2
# Passed to model.fit as batch_size.
BATCH_SIZE = 128
# Passed to model.fit as epochs.
# NOTE(review): the saved training output reads "Epoch 1/400", so this value
# appears to have been changed after the last recorded run.
EPOCHS = 2000
# Number of LSTM units, and the width of the initial h / c state inputs.
LATENT_DIM = 25

In [5]:
#load the data
# Every non-empty line of the poem file yields one training pair:
#   input  = '<sos> ' + line    (what the model reads)
#   target = line + ' <eos>'    (the same words, shifted one step ahead)
input_texts = []
target_texts = []
# 'with' closes the file handle as soon as reading is finished
with open('./drive/My Drive/nlp_course/machine_learning_examples/hmm_class/robert_frost.txt','r') as poem_file:
  for line in poem_file:
    line = line.rstrip()
    if line == '':
      continue
    input_texts.append('<sos> ' + line)
    target_texts.append(line + ' <eos>')
# inputs and targets together, so that both <sos> and <eos> reach the tokenizer
all_texts = input_texts + target_texts

#PRE PROCESS DATA
#create a tokenizer for the texts and convert them into numbers
#filters is empty so that <sos> and <eos> are not stripped: they are special tokens
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='')
tokenizer.fit_on_texts(all_texts)
input_sequences = tokenizer.texts_to_sequences(input_texts) #tokenized sentences
target_sequences = tokenizer.texts_to_sequences(target_texts) #tokenized sentences

#longest tokenized line, needed in order to pad sequences
max_data_seq_length = max(len(s) for s in input_sequences)
print('Max sequence length: {}'.format(max_data_seq_length))

#get the word->index dictionary
word2idx = tokenizer.word_index
idx2word = {v:w for w,v in word2idx.items()} #index->word dictionary to be used when generating poems
print('Found {} unique tokens'.format(len(word2idx)))
assert('<sos>' in word2idx)#sanity check
assert('<eos>' in word2idx)#sanity check

max_sequene_length = min(MAX_SEQUENCE_LENGTH,max_data_seq_length)
#Pad AFTER the words (padding='post'). The default pre-padding would make the
#LSTM step through padding before it reaches <sos>, whereas sample_line()
#feeds <sos> as the very first input with an all-zero state.
input_sequences = pad_sequences(input_sequences,maxlen=max_sequene_length,padding='post') #padded tokenized sentences
target_sequences = pad_sequences(target_sequences,maxlen=max_sequene_length,padding='post') #padded tokenized sentences
print('Shape of data tensor: ', input_sequences.shape)

Max sequence length: 12
Found 3056 unique tokens
Shape of data tensor:  (1436, 12)


In [6]:
#load word vectors
# Each row of the GloVe file is "<word> <v1> <v2> ... <vN>"; build a
# word -> float32 vector lookup from it.
print("Loading word vectors START")
word2vec = {}
glove_path = 'drive/My Drive/nlp_course/glove.6B.%sd.txt' % EMBEDDING_DIM
# Read explicitly as UTF-8: the vocabulary contains non-ASCII tokens, and the
# platform default encoding is not UTF-8 everywhere.
with open(glove_path, encoding='utf-8') as f:
  for line in f:
    line = line.split()
    word2vec[line[0]] = np.asarray(line[1:],dtype='float32')
print("Loaded %s word vectors" % len(word2vec))

Loading word vectors START
Loaded 400000 word vectors


In [7]:
#Prepare embedding matrix
# One row per word index. Rows stay all-zero for index 0 (padding) and for
# words that have no pre-trained vector.
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1) #the actual size of vocabulary. +1 for padding
embedding_matrix = np.zeros((num_words,EMBEDDING_DIM))
for token, row in word2idx.items():
  # skip words beyond the vocabulary cap or without a pre-trained vector
  if row >= MAX_VOCAB_SIZE or token not in word2vec:
    continue
  embedding_matrix[row] = word2vec[token]

In [8]:
#one-hot the targets (can't use sparse cross-entropy)
one_hot_targets = np.zeros((len(input_sequences), max_sequene_length,num_words))
# (sample, timestep) coordinates of every real target word; padding (index 0)
# is skipped, so those timesteps keep an all-zero target row
sample_pos, step_pos = np.where(target_sequences > 0)
one_hot_targets[sample_pos, step_pos, target_sequences[sample_pos, step_pos]] = 1

In [9]:
#Embedding layer
# Maps word indices to EMBEDDING_DIM-sized vectors, initialised from the
# GloVe-based embedding_matrix. This one layer object is reused by both the
# training model and the sampling model.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix]
    # Options left disabled on purpose:
    #input_length=MAX_SEQUENCE_LENGTH,
    #trainable=False
    # (with trainable=False commented out, the layer keeps the library default)
)

In [10]:
#training model architecture
# Inputs: a padded batch of word indices plus the LSTM's initial hidden (h) and
# cell (c) states. The states are model inputs so that the sampling model can
# reuse the same Input tensors and feed the state back in between steps.
input_ = Input(shape=(max_sequene_length,))
x = embedding_layer(input_)
initial_h = Input(shape=(LATENT_DIM,))
initial_c = Input(shape=(LATENT_DIM,))

# return_sequences=True gives an output for every timestep (one prediction per
# word position); return_state=True additionally returns the final h and c.
lstm = LSTM(LATENT_DIM,return_sequences=True,return_state=True)
x, _, _ = lstm(x, initial_state=[initial_h, initial_c])#the returned h and c states are not needed for training

# softmax over the num_words vocabulary entries at every timestep
dense = Dense(num_words,activation='softmax')
output = dense(x)

model = Model([input_,initial_h,initial_c], output)
# NOTE(review): `lr` is the older spelling of Adam's learning-rate argument;
# newer Keras releases call it `learning_rate` - confirm against the installed version.
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(lr=0.01),
    metrics=['accuracy']
)

In [11]:
print("Training time.....")
# every training sample starts from an all-zero hidden state and cell state
zero_state = np.zeros((len(input_sequences), LATENT_DIM))
r = model.fit(
    x=[input_sequences, zero_state, zero_state],
    y=one_hot_targets,
    validation_split=VALIDATION_SPLIT,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

Training time.....
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 76/400
Ep

In [None]:
# Training curves: one figure for the losses, one for the accuracies.
# Every entry is (key in r.history, label shown in the legend).
figures = (
  (('loss', 'loss'), ('val_loss', 'val_loss')),
  (('accuracy', 'acc'), ('val_accuracy', 'val_acc')),
)
for figure_curves in figures:
  for history_key, legend_label in figure_curves:
    plt.plot(r.history[history_key], label=legend_label)
  plt.legend()
  plt.show()

In [12]:
#sampling model architecture
#we reuse the layers from the previous model (embedding_layer, lstm, dense),
#so this model shares their weights with the training model
input2 = Input(shape=(1,)) #input here is the word generated previously
x = embedding_layer(input2)
x, h, c = lstm(x, initial_state=[initial_h, initial_c]) #now we need the h and c
output2 = dense(x)

# Takes one word plus the current LSTM state; returns the next-word
# probabilities together with the updated h and c, so the caller can feed the
# state back in on the next step.
sampling_model = Model([input2,initial_h,initial_c], [output2,h,c])


In [20]:
#function that generates a single line of poem
def sample_line():
  """Sample one line of poetry, word by word, from ``sampling_model``.

  Generation starts from the ``<sos>`` token with an all-zero LSTM state. On
  every step the previously sampled word and the state returned by the model
  are fed back in, and the next word is drawn at random from the predicted
  distribution. The line ends when ``<eos>`` is drawn or after
  ``max_sequene_length`` words.

  Returns:
    str: the sampled words joined by single spaces (no <sos>/<eos> markers).
  """
  #initial inputs
  np_input = np.array([[word2idx["<sos>"]]]) #1x1 array holding the index of the <sos> token
  h = np.zeros((1,LATENT_DIM))
  c = np.zeros((1,LATENT_DIM))

  eos = word2idx["<eos>"]

  output_sentence = []
  for _ in range(max_sequene_length):
    #verbose=0 keeps per-call progress output out of the generated poem
    o,h,c = sampling_model.predict([np_input,h,c], verbose=0)

    probs = o[0,0] #get the probabilities for the words
    if np.argmax(probs) == 0:
      #index 0 is the padding slot and should never be the most likely word
      print("something is wrong")
    probs[0] = 0 #never sample the padding index
    probs /= probs.sum() #re-normalize after removing index 0
    idx = np.random.choice(len(probs),p=probs) #sample the next word
    if idx == eos:
      break

    #accumulate the output line
    output_sentence.append(idx2word.get(idx, '<WRONG IDX %s>' % idx))

    #set the predicted word as the next input
    np_input[0,0] = idx
  return " ".join(output_sentence)
  

In [21]:
#generate a 4 line poem
# the generator is lazy, so each line is sampled right before it is printed
poem_lines = (sample_line() for _ in range(4))
for poem_line in poem_lines:
  print(poem_line)

i let you have now given its hand.
when i choked on the time longer.
toffile with live summer match green chanced,
i want to have to myself
