# Neural Language Model of English

- [How to development a word-level neural language model in keras](https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/)
- English texts
- Word-based neural language model based on word sequences of 50 words
- Use the republic texts

In [1]:
import string


# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text


# turn a doc into clean tokens
def clean_doc(doc):
    # replace '--' with a space ' '
    doc = doc.replace('--', ' ')
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # make lower case
    tokens = [word.lower() for word in tokens]
    return tokens


# save tokens to file, one dialog per line
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()


# load document
in_filename = 'republic_clean.txt'
doc = load_doc(in_filename)
print(doc[:200])

# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

# organize into sequences of tokens
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[i - length:i]
    # convert into a line
    line = ' '.join(seq)
    # store
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))

# save sequences to file
out_filename = 'republic_sequences.txt'
save_doc(sequences, out_filename)

The Project Gutenberg EBook of The Republic, by Plato

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it u
['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'republic', 'by', 'plato', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 'reuse', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'wwwgutenbergorg', 'title', 'the', 'republic', 'author', 'plato', 'translator', 'b', 'jowett', 'posting', 'date', 'august', 'ebook', 'release', 'date', 'october', 'last', 'updated', 'june', 'language', 'english', 'start', 'of', 'this', 'project', 'gutenberg', 'ebook', 'the', 'republic', 'produced', 'by', 'sue', 'asscher', 'the', 'republic', 'by', 'plato', 'tra

Total Sequences: 216740


In [2]:
from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# load
in_filename = 'republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [3]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# separate into input and output
sequences = array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=vocab_size)  # one-hot encode y
seq_length = X.shape[1]

# define model
model = Sequential()
model.add(Embedding(vocab_size, 50,
                    input_length=seq_length))  # word ebmedding layer
model.add(LSTM(100, return_sequences=True))  # LSTM 1
model.add(LSTM(100))  # LSTM 2
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile model
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=512, epochs=1)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 50)            522750    
_________________________________________________________________
lstm (LSTM)                  (None, 50, 100)           60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 10455)             1055955   
Total params: 1,729,605
Trainable params: 1,729,605
Non-trainable params: 0
_________________________________________________________________
None


  1/424 [..............................] - ETA: 0s - loss: 9.2548 - accuracy: 0.0020

  2/424 [..............................] - ETA: 1:45 - loss: 9.2541 - accuracy: 0.0439

  3/424 [..............................] - ETA: 2:25 - loss: 9.2534 - accuracy: 0.0566

  4/424 [..............................] - ETA: 2:44 - loss: 9.2524 - accuracy: 0.0645

  5/424 [..............................] - ETA: 2:53 - loss: 9.2513 - accuracy: 0.0656

  6/424 [..............................] - ETA: 3:00 - loss: 9.2499 - accuracy: 0.0680

  7/424 [..............................] - ETA: 3:05 - loss: 9.2479 - accuracy: 0.0698

  8/424 [..............................] - ETA: 3:10 - loss: 9.2448 - accuracy: 0.0703

  9/424 [..............................] - ETA: 3:12 - loss: 9.2400 - accuracy: 0.0677

 10/424 [..............................] - ETA: 3:14 - loss: 9.2319 - accuracy: 0.0674

 11/424 [..............................] - ETA: 3:15 - loss: 9.2185 - accuracy: 0.0673

 12/424 [..............................] - ETA: 3:17 - loss: 9.1953 - accuracy: 0.0690

 13/424 [..............................] - ETA: 3:17 - loss: 9.1645 - accuracy: 0.0702

 14/424 [..............................] - ETA: 3:18 - loss: 9.1269 - accuracy: 0.0709

 15/424 [>.............................] - ETA: 3:19 - loss: 9.0844 - accuracy: 0.0714

 16/424 [>.............................] - ETA: 3:20 - loss: 9.0357 - accuracy: 0.0719

 17/424 [>.............................] - ETA: 3:20 - loss: 8.9812 - accuracy: 0.0715

 18/424 [>.............................] - ETA: 3:20 - loss: 8.9199 - accuracy: 0.0710

 19/424 [>.............................] - ETA: 3:21 - loss: 8.8515 - accuracy: 0.0709

 20/424 [>.............................] - ETA: 3:21 - loss: 8.7762 - accuracy: 0.0712

 21/424 [>.............................] - ETA: 3:21 - loss: 8.7021 - accuracy: 0.0698

 22/424 [>.............................] - ETA: 3:21 - loss: 8.6161 - accuracy: 0.0692

 23/424 [>.............................] - ETA: 3:22 - loss: 8.5396 - accuracy: 0.0690

 24/424 [>.............................] - ETA: 3:21 - loss: 8.4634 - accuracy: 0.0686

In [None]:
# save the model to file
model.save('model.h5')
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))

In [None]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences


# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text


# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        yhat = model.predict_classes(encoded, verbose=0)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)


# load cleaned text sequences
in_filename = 'republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split()) - 1

# load the model
model = load_model('model.h5')

# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))

# select a seed text
seed_text = lines[randint(0, len(lines))]
print(seed_text + '\n')

# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)