In [2]:
import io
import os
import re
import string

import numpy as np
import tensorflow as tf
import tensorflowjs as tfjs
from tensorflow.keras.layers import LSTM, TimeDistributed, Softmax, Flatten, Dense
from tensorflow.keras.models import Sequential, load_model

In [6]:
## Read the whole training corpus into memory as one string

with io.open("sks.txt", mode = 'r', encoding = "utf-8") as corpus_file:
    text = corpus_file.read()
    

In [7]:
## Keep just the leading 200000 characters of the corpus

text = text[:200000]


In [8]:
##Clean the corpus by removing basic stop characters

## Raw string so the backslash escapes reach `re` untouched (no invalid-escape warnings).
## Everything sits inside a single character class, so each listed character
## (including the literal "+") and every whitespace character acts as a separator.
stop = r'[&\[\]<>"():;\-,\s+]'

## Lower-case the text, split on the separators and drop the empty pieces that
## consecutive separators leave behind.
corpus_tokens = [token for token in re.split(stop, text.lower()) if token]

## Re-join with single spaces: the cleaned corpus the model is trained on
clean_text = " ".join(corpus_tokens)

In [9]:
##Construct the alphabet

## Sorted so that every character gets the same index on every run: the iteration
## order of a set of strings can change between interpreter sessions, which would
## silently change the char <-> index mapping (and invalidate saved weights).
alphabet = sorted(set(clean_text))

In [10]:
##Build the lookup tables char -> index and index -> char

dictionary = {char: index for index, char in enumerate(alphabet)}
idx_to_char = dict(enumerate(alphabet))


In [11]:
## Corpus length in characters, and number of distinct characters (alphabet size)
size = len(clean_text)
dict_size = len(dictionary)

In [12]:
## Helper function to make one-hot vectors

def charToOneHot(char, dictionary):
    """Return the one-hot column vector encoding `char`.

    Parameters
    ----------
    char : key of `dictionary` (a single character in this notebook)
    dictionary : mapping from character to its integer index

    Returns
    -------
    numpy array of shape (len(dictionary), 1), all zeros except for a 1 in the
    row given by dictionary[char].

    Raises
    ------
    KeyError if `char` is not a key of `dictionary`.
    """
    one_hot = np.zeros((len(dictionary), 1))
    one_hot[dictionary[char], 0] = 1
    return one_hot

In [13]:
## Number of total characters
## (length of the cleaned corpus; used below to bound the training windows)

tot_char = len(clean_text)

In [19]:
##Prepare data
## Vectorised: every character is converted to its index once, and the sliding
## windows are built by array indexing instead of nested Python loops.

n_step = 25 ## Number of steps for the LSTM layer to use during training

## Integer index of every character of the cleaned corpus
encoded = np.array([dictionary[char] for char in clean_text], dtype = np.intp)

## Number of (input, target) window pairs. Clamped at 0 so that a corpus shorter
## than n_step + 2 characters yields empty arrays instead of an error.
n_samples = max(tot_char - n_step - 1, 0)

## window_idx[i, t] = i + t, so row i selects characters i .. i + n_step - 1
window_idx = np.arange(n_samples)[:, None] + np.arange(n_step)[None, :]

## Row k of the identity matrix is the one-hot vector for character index k.
## float32 halves the memory of the default float64 (Keras' default float type
## is float32 as well).
one_hot_rows = np.eye(len(dictionary), dtype = np.float32)

## Inputs: shape (n_samples, n_step, alphabet size)
X = one_hot_rows[encoded[window_idx]]

## Targets: the same windows shifted one character ahead, flattened to
## shape (n_samples, n_step * alphabet size)
y = one_hot_rows[encoded[window_idx + 1]].reshape(n_samples, n_step * len(dictionary))

In [47]:
##Build the neural network model

model = Sequential([
    ## Only the first layer declares the input shape: n_step one-hot vectors of size dict_size
    LSTM(256, input_shape = (n_step, dict_size), return_sequences = True),
    ## Its input shape is inferred from the previous layer's output, so none is passed here
    LSTM(256, return_sequences = True),
    ## One score per alphabet character at every time step, then a per-step softmax
    TimeDistributed(Dense(dict_size)),
    TimeDistributed(Softmax()),
    ## Flatten (n_step, dict_size) into a single vector so the output lines up with
    ## the flattened targets
    Flatten(),
])

In [36]:
## Print the layer-by-layer summary of the model (output shapes and parameter counts)
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 25, 256)           305152    
_________________________________________________________________
lstm_6 (LSTM)                (None, 25, 256)           525312    
_________________________________________________________________
time_distributed_6 (TimeDist (None, 25, 41)            10537     
_________________________________________________________________
time_distributed_7 (TimeDist (None, 25, 41)            0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 1025)              0         
Total params: 841,001
Trainable params: 841,001
Non-trainable params: 0
_________________________________________________________________


In [50]:
## Configure training: plain SGD optimiser with categorical cross-entropy loss

sgd_optimizer = tf.keras.optimizers.SGD(learning_rate = 0.01)
model.compile(loss = 'categorical_crossentropy', optimizer = sgd_optimizer)

In [None]:
## Train one epoch at a time; after every epoch sample some text from the network
## and save the current weights.

n_epochs = 150
n_generated = 140 ## Number of characters sampled after every epoch
weights_dir = "weights"

temperature = 0.8 ## The higher the temperature, the less conservative the predictions

## Make sure the checkpoint directory exists before the first save
os.makedirs(weights_dir, exist_ok = True)

for epoch in range(n_epochs):
    ## Report the epoch counter itself, not an index left over from another loop
    print("Epoch: ", epoch + 1,"/",n_epochs)
    print("--------------------------------------")
    model.fit(X, y,  batch_size = 128, epochs = 1)

    ##We test what the networks generates

    ## Seed window: must be exactly n_step characters long to match the reshape below
    word = 'i' * n_step
    tot = ""
    for _ in range(n_generated):
        vects = np.array([[charToOneHot(char, dictionary) for char in word]])
        vects = vects.reshape(1, n_step, dict_size)
        ## verbose = 0 keeps per-call progress output from flooding the cell
        y_preds = model.predict(vects, verbose = 0)
        ## The output is flattened to n_step * dict_size values; the last dict_size
        ## entries are the distribution predicted at the final time step.
        probs = y_preds.T[(n_step - 1) * dict_size: ]
        ## Temperature scaling followed by renormalisation to a probability vector
        probs = np.array(probs) ** (1/temperature)
        sum_norm = np.sum(probs)
        idx = np.random.choice(range(dict_size), p = np.squeeze(probs/sum_norm))
        new_char = idx_to_char[idx]
        tot = tot + new_char
        ## Slide the window: drop the oldest character, append the sampled one
        word = word[1:] + new_char
    model.save_weights(os.path.join(weights_dir, "rnn_weights-epoch-" + str(epoch) + ".h5"))
    print(tot)

In [None]:
## Generate text one character at a time until the network emits a full stop

n_step = 25
max_chars = 1000 ## Safety cap so the loop ends even if "." is never sampled
temperature = 0.4

## Seed window: must be exactly n_step characters long to match the reshape below
word = 'i' * n_step
tot = ""
new_char= ''

while new_char != "." and len(tot) < max_chars:
    vects = np.array([[charToOneHot(char, dictionary) for char in word]])
    vects = vects.reshape(1, n_step, dict_size)
    ## verbose = 0 keeps per-call progress output from flooding the cell
    y_preds = model.predict(vects, verbose = 0)
    ## Keep only the last dict_size entries of the flattened output, i.e. the
    ## distribution predicted at the final time step.
    probs = y_preds.T[(n_step - 1) * dict_size: ]
    ## Temperature scaling followed by renormalisation to a probability vector
    probs = np.array(probs) ** (1/temperature)
    sum_norm = np.sum(probs)
    idx = np.random.choice(range(dict_size), p = np.squeeze(probs/sum_norm))
    new_char = idx_to_char[idx]
    tot = tot + new_char
    ## Slide the window: drop the oldest character, append the sampled one
    word = word[1:] + new_char

## Lay the generated words out with random line breaks (25% chance after each word)
string_to_print = []
for token in tot.split():
    string_to_print.append(token)
    if (np.random.rand() < 0.25):
        string_to_print.append("\n")

## The first entry (the first generated word) is dropped -- presumably because it
## is dominated by the artificial seed; confirm if that is the intent.
output = " ".join(string_to_print[1:])

## Drop the last character of the joined text, but only when generation really
## ended on "." (if the safety cap was hit there is no terminator to remove).
if new_char == ".":
    output = output[:-1]
print(output)