## Generative Text Model using LSTM
## Chengyuan Zhou

In [1]:
import string
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM
import os

In [2]:
#Read files and concat them into one corpus
filenames = ['TPP.txt', 'TAM.txt', 'MLOE.txt', 'OKEWFSMP.txt']
#translation table that maps every punctuation character to a single space
remove = dict.fromkeys(map(ord, string.punctuation), ' ')
with open('corpus.txt', 'w') as outfile:
    for fname in filenames:
        #errors='ignore' drops any byte that is not valid ascii while decoding
        #the with-block closes each source file as soon as it has been read
        with open(fname, 'r', encoding='ascii', errors='ignore') as infile:
            content = infile.read()
        #translate replaces the punctuation with spaces
        #lower converts all upper case letters to lower case
        #split and join collapse every run of whitespace into exactly 1 space
        #so the corpus contains no punctuation and no upper case letters
        processed_content = ' '.join(content.translate(remove).lower().split())
        #NOTE(review): the files are written back to back with no separator, so the
        #last word of one file is fused with the first word of the next - confirm
        #whether that is intended before changing it (it would alter the corpus length)
        outfile.write(processed_content)

#read the merged corpus back and collect the set of distinct characters
with open('corpus.txt', 'r') as corpus_file:
    corpus_text = corpus_file.read()
corpus_set = set(corpus_text)

In [3]:
#two lookup tables between each distinct corpus character and its integer code
#the code of a character is its position in the sorted character list
ordered_chars = sorted(corpus_set)
corpus_char_code_dict = {char: index for index, char in enumerate(ordered_chars)}
corpus_code_char_dict = dict(enumerate(ordered_chars))

In [4]:
#lookup tables between each character and its min-max scaled ([0,1]) code
#despite the name, ascii_code_array holds the index codes taken from
#corpus_char_code_dict, arranged as a single column for the scaler
ascii_code_array = np.array(list(corpus_char_code_dict.values())).reshape(-1,1)
scaled_value_list = MinMaxScaler().fit_transform(ascii_code_array)

corpus_char_scaled_dict = {}
corpus_scaled_char_dict = {}
#the dict keys and the scaled rows are in the same order, so pair them directly
for key, scaled_row in zip(corpus_char_code_dict, scaled_value_list):
    scaled_value = scaled_row[0]
    corpus_char_scaled_dict[key] = scaled_value
    corpus_scaled_char_dict[scaled_value] = key

In [5]:
# making x data and y data
window_size = 100 #use first 99 characters in this window to predict the 100th character

#encode the whole corpus once; every input window below is a slice of this list,
#which avoids looking each character up again for every window it appears in
scaled_corpus = [corpus_char_scaled_dict[c] for c in corpus_text]

x_data = []
y_data = []
#NOTE(review): the range stops at len(corpus_text) - window_size, so the very last
#character of the corpus is never used as a target; kept as-is so the number of
#samples does not change
for i in range(len(corpus_text) - window_size):
    y_char = corpus_text[i+window_size-1]  #the 100th character of this window
    x_data.append(scaled_corpus[i:i+window_size-1]) # first 99 scaled chars as x
    y_data.append(corpus_char_code_dict[y_char])  # integer code of the last char as y

## Reference
- https://stackoverflow.com/questions/61550026/valueerror-shapes-none-1-and-none-3-are-incompatible

In [6]:
#turn the python lists into arrays in the layout the network is built for:
#x gets a trailing axis of length 1, y becomes one-hot rows with one column per character
x_data = np.array(x_data)
x_data = x_data.reshape(x_data.shape[0], x_data.shape[1], 1)
n_classes = len(corpus_char_code_dict)
y_data = keras.utils.to_categorical(y_data, num_classes=n_classes)
print(x_data.shape, y_data.shape, sep='\n')

(1530941, 99, 1)
(1530941, 37)


In [8]:
#single LSTM layer followed by a softmax layer with one output per character class
lstm = keras.models.Sequential([
    keras.layers.LSTM(units=256, input_shape=(x_data.shape[1], x_data.shape[2])),
    keras.layers.Dense(y_data.shape[1], activation="softmax"),
])
#targets are one-hot rows, hence categorical cross-entropy
lstm.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
lstm.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 256)               264192    
_________________________________________________________________
dense (Dense)                (None, 37)                9509      
Total params: 273,701
Trainable params: 273,701
Non-trainable params: 0
_________________________________________________________________


## Reference
#### 1. Model configuration
- https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
- https://machinelearningmastery.com/reshape-input-data-long-short-term-memory-networks-keras/
- https://keras.io/api/losses/probabilistic_losses/#categoricalcrossentropy-class

#### 2. Weights keeping using checkpoints
- https://www.tensorflow.org/tutorials/keras/save_and_load
- https://machinelearningmastery.com/check-point-deep-learning-models-keras/

In [9]:
checkpoint_path = "LSTM_checkpoints/cp.ckpt"

# Create a callback that saves the model's weights.
# save_best_only=True is what makes monitor/mode take effect: the checkpoint is
# only overwritten when the training loss improves, so the file holds the
# minimum-loss weights. Without it the file is rewritten after every epoch and
# ends up holding the weights of the last epoch.
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 save_best_only=True,
                                                 verbose=0,
                                                 monitor='loss',
                                                 mode='min')

lstm.fit(x_data, y_data, epochs=30, callbacks=[cp_callback])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fd7f4063630>

In [7]:
#rebuild the same architecture as the trained model and restore the weights
#stored at min_loss_cp, so text can be generated without training again
min_loss_cp = "LSTM_checkpoints/cp.ckpt"

min_loss_lstm = keras.models.Sequential([
    #input is window_size-1 time steps with one scaled character code each
    keras.layers.LSTM(units=256, input_shape=(window_size-1, 1)),
    keras.layers.Dense(y_data.shape[1], activation="softmax"),
])
min_loss_lstm.load_weights(min_loss_cp)
min_loss_lstm.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [20]:
init_text = 'There are those who take mental phenomena naively, just as they would physical phenomena. This school of psychologists tends not to emphasize the object.'

#normalise the seed text: punctuation becomes a space, everything is lower-cased,
#and runs of whitespace are collapsed to a single space
remove = {ord(mark): ' ' for mark in string.punctuation}
spaced_text = init_text.translate(remove)
processed_text = ' '.join(spaced_text.lower().split())
print(processed_text)

there are those who take mental phenomena naively just as they would physical phenomena this school of psychologists tends not to emphasize the object


In [47]:
#length of the model input; derived from window_size so it cannot drift away
#from the window length the model was built with
seq_len = window_size - 1
if len(processed_text) < seq_len:
    raise ValueError(
        'seed text must contain at least %d characters after preprocessing, got %d'
        % (seq_len, len(processed_text))
    )

test_input = processed_text[-seq_len:]
test_input_array = np.array([corpus_char_scaled_dict[x] for x in test_input])
generated_chars = []

for _ in range(1000):
    data = np.reshape(test_input_array, (1, seq_len, 1)) #reshape the feed-in data
    pred = min_loss_lstm.predict(data) #make prediction
    charIndex = int(np.argmax(pred)) #find the index of output with max prob
    pred_char = corpus_code_char_dict[charIndex] #translate the output to char
    generated_chars.append(pred_char)

    #slide the window one step: drop the oldest char and append the predicted one
    #only the new char needs encoding, the rest of the window is already scaled
    test_input = (test_input + pred_char)[-seq_len:]
    test_input_array = np.append(test_input_array[1:], corpus_char_scaled_dict[pred_char])

#seed text followed by the 1000 generated characters
result = processed_text + ''.join(generated_chars)
print(result)

there are those who take mental phenomena naively just as they would physical phenomena this school of psychologists tends not to emphasize the object which is the same as the shnple in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs in the state of affairs i