<a href="https://colab.research.google.com/github/abhranil-datascience/LSTM_POC/blob/master/PoetryGenerationPOC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
############################## Mount Drive ########################################
# Google Drive is mounted at /content/gdrive so the notebook can read files
# stored there (the working directory and the GloVe path below both live
# under this mount point).
from google.colab import drive
drive.mount('/content/gdrive')
############################## Change Directory ###################################
# Switch the working directory so relative paths used later in this cell
# (e.g. 'robert_frost.txt') are resolved inside the project folder on Drive.
import os
os.chdir('/content/gdrive/My Drive/MLandDLFullCourse/DL/AdvancedNLP/3.SeqToSeq/PoetryGeneration')
########################### Import Statements ####################################
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from keras.models import Model
from keras.layers import Dense, Embedding, Input, LSTM
from keras.optimizers import Adam, SGD
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
########################### Variable Initialization ##############################
# Width of the pretrained word vectors; it also selects which GloVe file is read.
EMBEDDING_DIM = 50
# Upper bound handed to the tokenizer as ``num_words``.
MaxVocabSize = 20000
# Placeholder only: the real value is computed after the dataset is tokenized.
MaxSequenceLength = 0
PreTrainedVectorFileName = "".join(["glove.6B.", str(EMBEDDING_DIM), "d.txt"])
PreTrainedVectorFilePath = "".join([
  "/content/gdrive/My Drive/MLandDLFullCourse/DL/AdvancedNLP/1.ToxicComments/Downloads/Glove6BUnzipped/",
  PreTrainedVectorFileName,
])
############################### Tokenize Input ###################################
def TokenizeInput(Dataset):
  """Fit a Keras ``Tokenizer`` on ``Dataset`` and return the fitted tokenizer.

  The tokenizer is capped at ``MaxVocabSize`` words and is created with an
  empty ``filters`` string, i.e. no characters are stripped from the text
  before it is split into tokens.
  """
  fitted_tokenizer=Tokenizer(filters='',num_words=MaxVocabSize)
  fitted_tokenizer.fit_on_texts(Dataset)
  return fitted_tokenizer

####################### Create Int To Word Mapping ##############################
def CreateIntToWordMapping(DataTokenizer):
  """Return the inverse of ``DataTokenizer.word_index`` (index -> word).

  Only entries whose index is less than or equal to ``MaxVocabSize`` are kept.
  """
  idx2word={}
  for token,index in DataTokenizer.word_index.items():
    if index<=MaxVocabSize:
      idx2word[index]=token
  return idx2word
  
############################### Pad Sequences ###################################
def PadSequences(Sequence):
  """Pad every sequence in ``Sequence`` to ``MaxSequenceLength``.

  Padding is added at the end of each sequence (``padding='post'``). The
  length is read from the module-level ``MaxSequenceLength`` at call time,
  so that value must already be set when this function runs.
  """
  return pad_sequences(Sequence,padding='post',maxlen=MaxSequenceLength)

############################ Create Embedding Matrix #############################
def CreateEmbeddingMatrix(DataDictionary,Int2WordMapping,MaxNumOfWords,EmbeddingDim=None):
  """Build a matrix of pretrained vectors for the words of the vocabulary.

  Args:
    DataDictionary: dict mapping a word to its pretrained vector.
    Int2WordMapping: dict mapping a token id to its word.
    MaxNumOfWords: number of rows of the returned matrix.
    EmbeddingDim: number of columns of the returned matrix; when omitted the
      module-level ``EMBEDDING_DIM`` is used.

  Returns:
    A ``(MaxNumOfWords, EmbeddingDim)`` array. The vector of token id ``k`` is
    stored in row ``k-1``; words without a pretrained vector keep an all-zero
    row. Callers that feed raw token ids (with 0 used for padding) to an
    Embedding layer must account for this one-row shift.
  """
  if EmbeddingDim is None:
    EmbeddingDim=EMBEDDING_DIM
  EmbeddingMatrix=np.zeros((MaxNumOfWords,EmbeddingDim))
  for index,word in Int2WordMapping.items():
    # A missing word is the only expected failure, so look it up explicitly
    # instead of hiding every possible exception behind a bare ``except``.
    vector=DataDictionary.get(word)
    if vector is not None:
      EmbeddingMatrix[index-1]=vector
  return EmbeddingMatrix

############################## Read Dataset ######################################
# Every non-empty line of the poem file becomes one training pair: the
# decoder input is the line prefixed with '<sos>' and the target is the same
# line followed by '<eos>'.
input_text=[]
target_text=[]
# ``with`` guarantees the file handle is closed once the lines are read.
with open('robert_frost.txt') as poem_file:
  for line in poem_file:
    line=line.rstrip()
    if not line:
      continue
    input_text.append('<sos> '+line)
    target_text.append(line + ' <eos>')
  
########### Tokenize Dataset ###############
# One tokenizer is fitted on inputs and targets together so that both the
# '<sos>' and '<eos>' markers end up in the same vocabulary.
all_lines = input_text + target_text
tokenizer = TokenizeInput(all_lines)
input_sequences = tokenizer.texts_to_sequences(input_text)
target_sequences = tokenizer.texts_to_sequences(target_text)
# Longest tokenized line across inputs and targets.
MaxSequenceLength = max(map(len, input_sequences + target_sequences))
print("Max Sequence Length: "+str(MaxSequenceLength))
print("")

# ----- word -> token id -----
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))
print("")
assert '<sos>' in word2idx
assert '<eos>' in word2idx

# ----- token id -> word -----
IntToWordMapping = CreateIntToWordMapping(tokenizer)

# ----- pad to a common length -----
# PadSequences reads MaxSequenceLength, which was assigned just above.
input_sequence = PadSequences(input_sequences)
target_sequence = PadSequences(target_sequences)

############ Load Pretrained Vectors ###########################
# word -> pretrained vector, parsed from the GloVe text file (one word per
# line: the token followed by its space-separated components).
word2vec={}
# The encoding is given explicitly so that decoding does not depend on the
# platform default (the GloVe vectors are distributed as UTF-8 text).
with open(PreTrainedVectorFilePath,encoding='utf-8') as f:
  for line in f:
    values=line.split()
    if not values:
      # Tolerate blank lines instead of failing on values[0].
      continue
    word2vec[values[0]]=np.array(values[1:],dtype=float)
print("Pretrained Vectors Loaded !! Size : "+str(len(word2vec)))
print("")

########### Create Embedding Matrix #####################
# Number of vocabulary entries the model distinguishes: the vocabulary found
# in the data, capped at MaxVocabSize.
num_of_words=min(MaxVocabSize,len(word2idx))
print("Maximum Number Of Words: "+str(num_of_words))
print("")
EmbeddingMatrix=CreateEmbeddingMatrix(word2vec,IntToWordMapping,num_of_words)

############### One Hot Targets #############################
# One-hot target tensor of shape (samples, timesteps, num_of_words), matching
# the width of the softmax layer. Token ids start at 1, so token id k is
# represented by column k-1 (SampleLine undoes this shift with "+1").
# The tensor is filled by direct indexing, which avoids building a
# vocabulary-by-vocabulary lookup table and does not rely on the
# `categorical_features` argument of OneHotEncoder (no longer available in
# current scikit-learn releases).
TargetTokens=np.asarray(target_sequence)
TargetArray=np.zeros((len(input_sequence),MaxSequenceLength,num_of_words))
# Padding positions hold token id 0 and keep an all-zero target vector.
rows,cols=np.nonzero(TargetTokens)
TargetArray[rows,cols,TargetTokens[rows,cols]-1]=1

#################### Create Model ###################################
print("Building Model !! ")
print("")

LATENT_DIM = 25
# Token ids go up to at most num_of_words and 0 is the padding id, so the
# lookup table needs num_of_words+1 rows. CreateEmbeddingMatrix stores the
# vector for token id k in row k-1; prepending one all-zero padding row
# therefore places every pretrained vector on the row the Embedding layer
# reads for that id and keeps the largest id inside the table.
EmbeddingWeights=np.vstack([np.zeros((1,EmbeddingMatrix.shape[1])),EmbeddingMatrix])

# Inputs: a padded line of token ids plus the initial LSTM hidden/cell state.
input_= Input(shape=(MaxSequenceLength,))
initial_h = Input(shape=(LATENT_DIM,))
initial_c = Input(shape=(LATENT_DIM,))
# The layers are kept in module-level names because the sampling model built
# later in this file reuses the same (trained) layer objects.
EmbeddingLayer = Embedding(input_dim=EmbeddingWeights.shape[0],output_dim=EmbeddingWeights.shape[1],weights=[EmbeddingWeights],trainable=True)
lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
dense = Dense(num_of_words, activation='softmax')

x = EmbeddingLayer(input_)
# Only the per-timestep outputs are needed for training; the final states
# returned by the LSTM are discarded here.
x, _, _ = lstm(x, initial_state=[initial_h, initial_c])
output = dense(x)

model=Model([input_,initial_h,initial_c],output)
model.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.01),metrics=['accuracy'])

print("Fitting Model !! ")
print("")

# All-zero initial hidden and cell states, one row per training sample.
z=np.zeros((len(input_sequence),LATENT_DIM))
model.fit([input_sequence,z,z],TargetArray,batch_size=128,epochs=2000,validation_split=0.2)

################### Create Prediction Model ############################
# Single-step model for generation: it consumes one token id together with
# the current LSTM state and returns the next-word distribution plus the
# updated state. It is wired from the same layer objects as the training model.
test_input=Input(shape=(1,))
step_embedding=EmbeddingLayer(test_input)
step_output,h,c=lstm(step_embedding,initial_state=[initial_h, initial_c])
step_probs=dense(step_output)
SamplingModel=Model(inputs=[test_input,initial_h,initial_c],outputs=[step_probs,h,c])


################## Use Prediction Model ############################
def SampleLine():
  """Generate one line of text with ``SamplingModel`` and print it.

  Generation starts from the '<sos>' token with all-zero LSTM states and
  feeds each predicted token back in as the next input. The first word is
  drawn at random from the predicted distribution; every later word is the
  arg-max of the distribution. The loop stops at '<eos>' or after
  ``MaxSequenceLength`` steps. The line is printed, followed by an empty
  line; nothing is returned.
  """
  np_input=np.array([[word2idx['<sos>']]])
  h=np.zeros((1,LATENT_DIM))
  c=np.zeros((1,LATENT_DIM))
  EOS=word2idx['<eos>']
  output_sentence=[]
  for step in range(MaxSequenceLength):
    o,h,c=SamplingModel.predict([np_input,h,c])
    probs=o[0,0]
    # Output column j stands for token id j+1, hence the "+1" below.
    if step == 0:
      NextWord=np.random.choice(len(probs), p=probs)+1
    else:
      NextWord=np.argmax(probs)+1
    if NextWord==EOS:
      break
    output_sentence.append(IntToWordMapping[NextWord])
    # The chosen token becomes the model input for the next step.
    np_input=np.array([[NextWord]])
  PoemLine=" ".join(output_sentence).strip()
  print(PoemLine)
  print("")

################# Generate Sample Line ############################
while True:
  # A "poem" is four independently sampled lines framed by banner lines.
  print("---------------Start Of Poem--------------------")
  print("")
  for _ in range(4):
    SampleLine()
  print("---------------End Of Poem--------------------")
  print("")
  # Only a reply whose first character is "n"/"N" stops the loop; any other
  # reply, including an empty one, generates another poem.
  ans = input("---generate another? [Y/n]---")
  if ans[:1].lower().startswith('n'):
    break

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Max Sequence Length: 12

Found 3056 unique tokens.

Pretrained Vectors Loaded !! Size : 400000

Maximum Number Of Words: 3056

Building Model !! 

Fitting Model !! 

Train on 1148 samples, validate on 288 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 4