In [1]:
#importing dependencies
import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Download the plain-text file 84-0.txt from Project Gutenberg into the working directory
!wget https://www.gutenberg.org/files/84/84-0.txt

--2021-09-27 14:33:39--  https://www.gutenberg.org/files/84/84-0.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 448821 (438K) [text/plain]
Saving to: ‘84-0.txt’


2021-09-27 14:34:16 (4.01 MB/s) - ‘84-0.txt’ saved [448821/448821]



In [11]:
# Load the whole corpus into memory as a single string.
# The input is the Project Gutenberg text file 84-0.txt.
# The context manager closes the file handle deterministically, and the explicit
# encoding keeps decoding independent of the platform's default locale
# (assumes the file is UTF-8, as Gutenberg "-0.txt" files normally are; TODO confirm).
with open("84-0.txt", encoding="utf-8") as corpus_file:
  file = corpus_file.read()

In [48]:
# Tokenisation and standardisation
def tokenize_words(input):
  """Lowercase text, split it into word tokens and drop English stopwords.

  Args:
    input: Raw text as a single string.

  Returns:
    A single string holding the remaining tokens joined by single spaces.
  """
  # Lowercase everything to standardise the text
  lowered = input.lower()
  # r'\w+' keeps runs of word characters, so punctuation and whitespace are dropped
  tokenizer = RegexpTokenizer(r'\w+')
  tokens = tokenizer.tokenize(lowered)
  # Load the stopword list once (not once per token) and use a set for fast lookups
  english_stopwords = set(stopwords.words('english'))
  return " ".join(token for token in tokens if token not in english_stopwords)

# Preprocess the raw corpus into cleaned, space-separated text
processed_inputs = tokenize_words(file)

In [49]:
# Build the sorted character vocabulary and map each character to an integer id
chars = sorted(set(processed_inputs))
char_to_num = {char: idx for idx, char in enumerate(chars)}

In [50]:
# Sanity check: length of the processed text and size of the character vocabulary
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)


Total number of characters: 269878
Total vocab: 43


In [51]:
# Number of characters in each input window
seq_length = 100
# Containers for the encoded input windows and their next-character targets
x_data, y_data = [], []

In [52]:
# Slide a window of seq_length characters over the text, one step at a time:
# each window is an input sample and the character right after it is its target.
# The lists are reset here so that re-running this cell does not append a
# second copy of every pattern to whatever an earlier run left behind.
x_data = []
y_data = []
# Encode the text once instead of looking up every character again for every window
encoded = [char_to_num[char] for char in processed_inputs]
for i in range(0, input_len - seq_length, 1):
  x_data.append(encoded[i:i + seq_length])
  y_data.append(encoded[i + seq_length])

n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 269778


In [53]:
# Shape the windows as (n_patterns, seq_length, 1) and divide the ids by the vocabulary size
X = numpy.array(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [54]:
# One-hot encode the targets. num_classes is pinned to the vocabulary size so the
# width of y does not depend on which characters happen to occur as targets
# (by default to_categorical infers it from the largest label present).
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [55]:
# Model: three stacked LSTM layers, each followed by dropout, then a softmax
# layer with one output per column of y
model = Sequential([
  LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
  Dropout(0.2),
  LSTM(256, return_sequences=True),
  Dropout(0.2),
  LSTM(128),
  Dropout(0.2),
  Dense(y.shape[1], activation='softmax'),
])

In [56]:
# Configure training: categorical cross-entropy loss with the Adam optimizer
model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
)

In [58]:
# Checkpoint to disk whenever the training loss reaches a new minimum
filepath = 'model_weights_saved.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
# Keras expects a list of callbacks; "(checkpoint)" is just the bare object,
# not a one-element tuple, so wrap it in a list explicitly
desired_callbacks = [checkpoint]

In [59]:
# Train for 4 epochs in batches of 256, passing the configured callbacks
model.fit(
  X,
  y,
  epochs=4,
  batch_size=256,
  callbacks=desired_callbacks,
)

Epoch 1/4

Epoch 00001: loss improved from inf to 2.93051, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.93051 to 2.66161, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.66161 to 2.50990, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.50990 to 2.40641, saving model to model_weights_saved.hdf5


<keras.callbacks.History at 0x7f8f366c3050>

In [61]:
# Restore the weights saved during training, then compile the model again
filename = 'model_weights_saved.hdf5'
model.load_weights(filename)
model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
)

In [62]:
# Inverse lookup (integer id -> character) used to decode model output
num_to_char = dict(enumerate(chars))

In [63]:
# Pick a random training window as the seed for text generation.
# randint's upper bound is exclusive, so len(x_data) makes every window eligible
# (with "len(x_data)-1" the last window could never be selected).
start = numpy.random.randint(0, len(x_data))
# Copy the window so that later changes to pattern cannot modify the entry in x_data
pattern = list(x_data[start])
print("Random Seed :")
print("\"",''.join([num_to_char[value] for value in pattern]),"\"")

Random Seed :
" tions 50 states united states compliance requirements uniform takes considerable effort much paperwo "


In [64]:
# Generate 1000 characters, one at a time.
# Each step feeds the current window to the model, takes the most probable next
# character (greedy argmax), writes it out, and slides the window forward by one.
for i in range(1000):
  x = numpy.reshape(pattern, (1, len(pattern), 1))
  x = x/float(vocab_len)
  prediction = model.predict(x, verbose=0)
  index = int(numpy.argmax(prediction))
  result = num_to_char[index]
  sys.stdout.write(result)
  # Build a new list instead of appending in place: pattern may be the very
  # list object stored in x_data, and appending would lengthen that entry
  pattern = pattern[1:] + [index]

e seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare se