In [0]:
# Upload data
from google.colab import files

# Open Colab's upload dialog and keep whatever mapping it returns.
uploaded = files.upload()

# Report each uploaded entry by name together with the length of its content.
for fn, payload in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(payload)))

In [0]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential


# Set seeds for reproducibility.
# NOTE(review): `set_random_seed` is imported from the top-level `tensorflow`
# namespace — presumably this targets TensorFlow 1.x; confirm the runtime version.
from tensorflow import set_random_seed
from numpy.random import seed
set_random_seed(2)  # seed TensorFlow's random number generation
seed(1)  # seed NumPy's global random number generator

import pandas as pd
import numpy as np
import string, os 

import warnings
# Silence every warning for the rest of the session. Because this first filter
# already ignores all warning categories, the FutureWarning-specific filter on
# the next line does not suppress anything additional.
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

  
# Load the tab-separated quotes file; only its first column is used below.
# Malformed rows are skipped (with a warning) instead of aborting the load.
# pandas 1.3 replaced `error_bad_lines=False` with `on_bad_lines`, and pandas 2.0
# removed the old keyword, so try the current spelling first and fall back to
# the legacy one when the installed pandas does not know `on_bad_lines`.
try:
    df = pd.read_csv('quotes_file.txt', sep='\t', header=None, on_bad_lines='warn')
except TypeError:
    df = pd.read_csv('quotes_file.txt', sep='\t', header=None, error_bad_lines=False)

# Drop empty cells and force every entry to `str` so the text-cleaning helpers
# (which call string methods on each quote) never receive NaN or numeric values.
quotes = df.iloc[:, 0].dropna().astype(str).values.tolist()
print(quotes[0:10])

def clean_text(txt):
    """Return `txt` without ASCII punctuation, lowercased, and reduced to ASCII.

    Args:
        txt: Text to normalise (iterated character by character).

    Returns:
        The cleaned string; characters that cannot be represented in ASCII
        are silently dropped.
    """
    kept = [ch for ch in txt if ch not in string.punctuation]
    lowered = "".join(kept).lower()
    # Encode to UTF-8 bytes, then decode as ASCII while discarding every byte
    # that is not valid ASCII.
    return lowered.encode("utf8").decode("ascii", "ignore")

# Apply clean_text to every loaded quote, then preview the first ten results.
corpus = list(map(clean_text, quotes))
corpus[:10]

In [0]:
import keras
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import sys
import re
%matplotlib inline

n_samples = 200000  # number of rows allocated for the one-hot training arrays x and y
max_len = 20  # characters per input window (passed as `lookback` to generate_split)

def clean_sentences(sentences, lower = True):
    """Strip, filter and wrap each sentence with tab/newline boundary markers.

    Every sentence has surrounding whitespace removed, then all digits and the
    characters ``# $ % & * + ( ) : = ? /`` deleted. The result is wrapped as
    ``"\\t " + text + " \\n"``.

    Args:
        sentences: Iterable of strings.
        lower: The text is lowercased only when this compares equal to True.

    Returns:
        A list with one wrapped string per input sentence, in input order.
    """
    unwanted = r'[0-9#$%&*+():=?/]+'
    wrapped = []
    for raw in sentences:
        text = re.sub(unwanted, '', raw.strip())
        # Equality comparison (not truthiness) is intentional: it mirrors the
        # established behaviour for non-boolean flag values.
        if lower == True:
            text = text.lower()
        wrapped.append("\t " + text + " \n")
    return wrapped
  
# Strip, filter and wrap every quote with its tab/newline markers (lowercased).
cleaned_quotes = clean_sentences(quotes, lower=True)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Character-level tokenizer fitted on the cleaned quotes.
tokenizer = Tokenizer(split=' ', char_level=True)
tokenizer.fit_on_texts(cleaned_quotes)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(cleaned_quotes)

print('Found %d unique tokens.' % len(word_index))

# Reverse lookup table: integer index -> character.
index2char = {index: ch for ch, index in tokenizer.word_index.items()}

In [0]:
# Decode the first encoded quote back into characters, skipping any index 0.
test = sequences[0]
s = "".join(index2char[temp] for temp in test if temp != 0)

# Show the cleaned quote next to its decoded form for visual comparison.
cleaned_quotes[0], s

In [0]:
def generate_split(data, lookback= 20, step = 5):
    """Cut each sequence into fixed-length windows paired with the next item.

    For every element of `data`, positions ``lookback, lookback + step, ...``
    (below the element's length) each yield the `lookback` items preceding the
    position as a window and the item at the position as its target.

    Args:
        data: Iterable of sliceable sequences (e.g. strings).
        lookback: Window length.
        step: Distance between consecutive window positions.

    Returns:
        ``(windows, targets)`` as two parallel lists. The number of windows is
        also printed.
    """
    windows, targets = [], []
    for text in data:
        positions = range(lookback, len(text), step)
        windows.extend(text[pos - lookback:pos] for pos in positions)
        targets.extend(text[pos] for pos in positions)
    print(len(windows))
    return windows, targets

# Slice every cleaned quote into max_len-character windows, moving 10 characters
# at a time; `next_char` holds the character that follows each window.
sentence, next_char = generate_split(
    cleaned_quotes,
    lookback=max_len,
    step=10,
)

In [0]:
# One slot per character index plus one extra slot.
# NOTE(review): assumes the tokenizer's indices start at 1, leaving slot 0 unused — confirm.
n_tokens = len(word_index) + 1

# Allocate only as many rows as there are windows to encode. Sizing the arrays
# by `n_samples` alone leaves all-zero input/target rows in the training set
# whenever fewer than `n_samples` windows are available.
n_rows = min(n_samples, len(sentence))

# float32 halves the memory of NumPy's default float64; the one-hot values
# (0 and 1) are represented exactly either way.
x = np.zeros((n_rows, max_len, n_tokens), dtype='float32')
y = np.zeros((n_rows, n_tokens), dtype='float32')

# One-hot encode every character of each window (x) and the character that
# follows the window (y).
for i, s in enumerate(sentence[:n_rows]):
    for t, char in enumerate(s):
        x[i, t, word_index[char]] = 1
    y[i, word_index[next_char[i]]] = 1
  
from keras.models import Sequential
from keras import Model
from keras.layers import Input, Embedding, LSTM, Dense, Flatten,Dropout, Conv1D,MaxPool1D
from keras.optimizers import RMSprop
from keras.models import load_model

# Two stacked 128-unit LSTM layers, each followed by dropout at rate 0.3, ending
# in a softmax layer with one output per token slot.
model2 = Sequential([
    LSTM(128, input_shape=(max_len, n_tokens), return_sequences=True),
    Dropout(0.3),
    LSTM(128),
    Dropout(0.3),
    Dense(n_tokens, activation='softmax'),
])

In [0]:
# RMSprop at a 0.01 learning rate, minimising categorical cross-entropy.
optimizer = keras.optimizers.RMSprop(lr=0.01)
model2.compile(optimizer=optimizer, loss='categorical_crossentropy')

In [0]:
# Both callbacks watch the training loss ('loss'):
#   * ReduceLROnPlateau multiplies the learning rate by 0.1 (patience of one epoch).
#   * ModelCheckpoint writes the model to 'quotegen_test.h5', keeping only the best one.
callbacks_list = [
    keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.1, patience=1),
    keras.callbacks.ModelCheckpoint(
        filepath='quotegen_test.h5', monitor='loss', save_best_only=True
    ),
]

In [0]:
# Print a summary of the layers that make up model2.
model2.summary()

In [0]:
epochs = 10

# Fit model2 on the one-hot arrays; `history` keeps the object returned by fit().
history = model2.fit(
    x,
    y,
    epochs=epochs,
    batch_size=512,
    verbose=1,
    callbacks=callbacks_list,
)

In [0]:
def sample(preds, temperature= 0.2):
    """Draw one index from a probability vector after temperature rescaling.

    Args:
        preds: NumPy array of probabilities; it is reshaped to a 1-D vector
            whose length is the size of its last axis.
        temperature: Divisor applied to the log-probabilities. Values below 1
            sharpen the distribution, values above 1 flatten it.

    Returns:
        The index selected by a single multinomial draw.
    """
    flat = np.asarray(np.reshape(preds, preds.shape[-1])).astype('float64')
    # The small epsilon keeps log() finite for zero probabilities.
    scaled_logits = np.log(flat + 1e-25) / temperature
    weights = np.exp(scaled_logits)
    distribution = weights / np.sum(weights)
    # One trial, one experiment: the result is a one-hot row marking the winner.
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

def inference(sent, string_length=20):
    """Right-pad a seed string with spaces to a fixed width.

    Args:
        sent: Seed text.
        string_length: Minimum width of the result. Defaults to 20, the width
            that was previously hard-coded inside the function.

    Returns:
        `sent` left-justified in a field of `string_length` characters. Text
        that is already at least that long is returned unchanged (it is never
        truncated).
    """
    return sent.ljust(string_length)

In [0]:
start_word = "be"
inference_text = inference(start_word)

# Produce five samples from the same seed text.
# Distinct loop variables are used for the two nested loops, and the sampled
# character is stored in `predicted_char` so that the global `next_char` list
# (the training targets) is not overwritten by a single character.
for sample_no in range(5):
    sys.stdout.write(inference_text.strip() + " ")
    # The sliding window must be exactly max_len characters wide to match the
    # (1, max_len, n_tokens) array built below.
    generated_text = inference_text[:max_len].ljust(max_len)
    for step_no in range(100):
        # One-hot encode the current window.
        sampled = np.zeros((1, max_len, n_tokens))
        for t, char in enumerate(generated_text):
            sampled[0, t, word_index[char]] = 1.
        preds = model2.predict(sampled, verbose=0)[0]
        next_index = sample(preds, 0.5)
        predicted_char = index2char.get(next_index)
        if predicted_char is None:
            # The model has one more output slot than index2char has entries,
            # so the sampler can return an index with no character; skip that
            # draw instead of raising KeyError.
            continue
        if predicted_char == '\n':
            # Newline is the end-of-quote marker added during cleaning.
            break
        # Slide the window forward by one character.
        generated_text = (generated_text + predicted_char)[1:]
        sys.stdout.write(predicted_char)
    print("\n-")

In [0]:
# Download the trained model file ('quotegen_test.h5', the path the ModelCheckpoint
# callback writes to) through Colab's file helper.
from google.colab import files
files.download('quotegen_test.h5') 

In [0]:
# Load the saved model file back from disk.
model = load_model('quotegen_test.h5')

In [0]:
from google.colab import files

# Open Colab's upload dialog and keep whatever mapping it returns.
uploaded = files.upload()

# Report each uploaded entry by name together with the length of its content.
for fn, payload in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(payload)))