<a href="https://colab.research.google.com/github/ankitstar01/ml-learning/blob/master/poetry_seq_to_seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string

import tensorflow as tf
from tensorflow import keras
from keras.models import Model
from keras.layers import Dense,Embedding,LSTM,Input
from keras.optimizers import Adam,SGD
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences



MAX_SEQ_LEN=100
MAX_VOCAB_SIZE=3000
EMBEDDING_DIM=50
VALIDATION_SPLIT=0.2
BATCH_SIZE=128
EPOCHS=2000
LATEN_DIM=25

In [2]:
#loading the data
input_data=[]
output_data=[]

for line in open('./robert_frost.txt'):
    line=line.rstrip()
    if not line:
        continue
    
    input_line='<sos>'+line
    output_line=line+'<eos>'
    
    input_data.append(input_line)
    output_data.append(output_line)
all_line=input_data+output_data

In [4]:
tokenizer=Tokenizer(num_words=MAX_VOCAB_SIZE,filters='')
tokenizer.fit_on_texts(all_line)
input_seq=tokenizer.texts_to_sequences(input_data)
output_seq=tokenizer.texts_to_sequences(output_data)
max_seq_len=max(len(s) for s in input_seq)
print('Max Seq Len {}'.format(max_seq_len))

Max Seq Len 11


In [6]:
#word to index mapping
word2idx=tokenizer.word_index
print('total unique tokens %s '%len(word2idx))
# assert('<sos>' in word2idx)
# assert('<eos>' in word2idx)

max_seq_len=min(MAX_SEQ_LEN,max_seq_len)
input_seq=pad_sequences(input_seq,maxlen=max_seq_len,padding='post')
output_seq=pad_sequences(output_seq,maxlen=max_seq_len,padding='post')
print('shape of input tensor : ',input_seq.shape)

total unique tokens 4614 
shape of input tensor :  (1436, 11)


In [9]:
import requests

url = 'http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip'
r = requests.get(url, stream=True)
open('glove.zip', 'wb').write(r.content)
!unzip ./glove.zip

Archive:  ./glove.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [12]:
print('loading word vector')
word2vec={}
with open(os.path.join('./glove.6B.%sd.txt'%EMBIDDING_DIM)) as f:
    for line in f:
        value=line.split()
        word=value[0]
        vec=np.asarray(value[1:],dtype='float32')
        word2vec[word]=vec
print('found %s word vectors'%len(word2vec))
    

loading word vector
found 400000 word vectors


In [15]:
print('filling pretrained embedding')
num_words=min(MAX_VOCAB_SIZE,len(word2idx)+1)
embedding_matrix=np.zeros((num_words,EMBEDDING_DIM))
for word,i in word2idx.items():
  if i<MAX_VOCAB_SIZE:
    embedding_vec=word2vec.get(word)
    if embedding_vec is not None:
      embedding_matrix[i]=embedding_vec

filling pretrained embedding
