### Import libraries

In [84]:
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt
import tqdm

### Helper functions

In [136]:
def load_dataset(files, text = '', vocab = None):
     """Read text files and build a sorted character vocabulary.

     Args:
          files: Iterable of paths to UTF-8 encoded text files. Their decoded
               contents are appended to ``text`` in the given order.
          text: Text that the file contents are appended to.
          vocab: Optional existing vocabulary (any sized iterable of
               characters, e.g. a list or a NumPy array). When it is missing
               or empty the vocabulary is built from the text alone;
               otherwise the characters of the text are merged into it.

     Returns:
          A ``(text, vocab)`` tuple where ``vocab`` is a sorted list of the
          unique characters.

     Raises:
          OSError: If one of the files cannot be opened.
          UnicodeDecodeError: If a file is not valid UTF-8.
     """
     # Collect the pieces and join once instead of growing a string in a loop.
     pieces = [text]
     for file in files:
          # Binary read + explicit decode keeps line endings untouched; the
          # context manager guarantees the handle is closed.
          with open(file, 'rb') as handle:
               pieces.append(handle.read().decode(encoding='UTF-8'))
     text = ''.join(pieces)

     if vocab is None or len(vocab) == 0:
          # No previous vocabulary: use the unique characters of the text.
          vocab = sorted(set(text))
     else:
          # Add new characters to the existing vocabulary.
          vocab = sorted(set(vocab).union(text))

     print('Text length:', len(text), 'Unique characters:', len(vocab))
     return text, vocab

def split_input_target(chunk):
     """Turn one sequence into an (input, target) pair shifted by one step.

     The input is every element except the last and the target is every
     element except the first, so the target at position ``i`` is the element
     that follows the input at position ``i``.
     """
     return chunk[:-1], chunk[1:]

def save(path, model, vocab, history):
     """Write the model, the training history and the vocabulary to ``path``.

     Three files are written into the directory: ``model.h5``,
     ``history.npy`` and ``vocab.npy``. The directory is created when it does
     not exist yet.

     Args:
          path: Target directory.
          model: Object exposing ``save(filepath)``.
          vocab: Vocabulary to store with ``np.save``.
          history: Training history to store with ``np.save``.
     """
     # Saving into a directory that does not exist would fail, so create it.
     if path:
          os.makedirs(path, exist_ok=True)
     model.save(path + '/model.h5')
     np.save(path + '/history.npy', history)
     np.save(path + '/vocab.npy', vocab)
     print('Model saved')

# Load the model, its vocabulary and its training history
def loadModel(path):
     """Load the files ``model.h5``, ``vocab.npy`` and ``history.npy`` from ``path``.

     Args:
          path: Directory containing the three files.

     Returns:
          A ``(model, vocab, history)`` tuple: the Keras model, the vocabulary
          array and the object stored in ``history.npy``.

     Raises:
          ValueError: If ``history.npy`` does not hold exactly one object.
     """
     model = tf.keras.models.load_model(path + '/model.h5')
     # NOTE(security): allow_pickle=True unpickles file contents, which can
     # execute arbitrary code. Only load directories you trust.
     vocab = np.load(path + '/vocab.npy', allow_pickle=True)
     history = np.load(path + '/history.npy', allow_pickle=True)
     # np.save stores a non-array object such as a dict as a 0-d object
     # array; .item() returns that object directly. This replaces
     # eval(str(history)), which breaks on values whose repr is not a Python
     # literal (inf, nan, ...).
     return model, vocab, history.item()

### Load Model & dataset (only if loading existing model)

In [138]:
# Restore a previously saved model together with its vocabulary and history.
model, vocab, history = loadModel(
     "c:/Users/augus/OneDrive/Documents/code/AI/textGen/model"
)
# Read the corpus; characters missing from the saved vocabulary are merged in.
# NOTE(review): if the vocabulary grows here it may no longer match the layer
# sizes of the loaded model - confirm the corpus adds no new characters.
text, vocab = load_dataset(['sonnet.txt'], '', vocab)

# Reuse the batch size recorded in the input shape of the first layer.
batchSize = model.layers[0].input_shape[0]

# Character -> id lookup, followed by the inverse id -> character lookup.
ids_from_chars = tf.keras.layers.StringLookup(vocabulary=list(vocab), mask_token=None)
chars_from_ids = tf.keras.layers.StringLookup(
     vocabulary=ids_from_chars.get_vocabulary(),
     mask_token=None,
     invert=True,
)

Text length: 96902 Unique characters: 65


### Create model & load dataset (only if creating a new model)

In [104]:
batchSize = 64
text, vocab = load_dataset(['sonnet.txt'])

# Character -> id lookup, followed by the inverse id -> character lookup.
ids_from_chars = tf.keras.layers.StringLookup(vocabulary=list(vocab), mask_token=None)
chars_from_ids = tf.keras.layers.StringLookup(
     vocabulary=ids_from_chars.get_vocabulary(),
     mask_token=None,
     invert=True,
)

# Assemble the network layer by layer: embedding -> stateful GRU -> logits.
# Embedding and output sizes both follow the lookup layer's vocabulary.
model = tf.keras.Sequential()
model.add(
     tf.keras.layers.Embedding(
          len(ids_from_chars.get_vocabulary()),
          256,
          batch_input_shape=[batchSize, None],
     )
)
model.add(
     tf.keras.layers.GRU(
          1024,
          return_sequences=True,
          stateful=True,
          recurrent_initializer='glorot_uniform',
     )
)
model.add(tf.keras.layers.Dense(len(ids_from_chars.get_vocabulary())))

# The Dense layer has no activation, so the loss is computed from logits.
model.compile(
     optimizer='adam',
     loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
     metrics=['accuracy'],
)
model.summary()

# Per-epoch metrics accumulated across training runs.
history = {'loss': [], 'accuracy': []}


Text length: 96902 Unique characters: 65
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (64, None, 256)           16896     
                                                                 
 gru_4 (GRU)                 (64, None, 1024)          3938304   
                                                                 
 dense_4 (Dense)             (64, None, 66)            67650     
                                                                 
Total params: 4,022,850
Trainable params: 4,022,850
Non-trainable params: 0
_________________________________________________________________


### Build the training dataset

In [139]:
seq_length = 100

# Encode the text as character ids and cut it into chunks of seq_length + 1
# ids (one extra so input and target can be shifted by one); the incomplete
# last chunk is dropped.
sequences = tf.data.Dataset.from_tensor_slices(
     ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
).batch(seq_length + 1, drop_remainder=True)

# Build the training pipeline one stage at a time.
dataset = sequences.map(split_input_target)
dataset = dataset.shuffle(10000)
# drop_remainder keeps every batch at exactly batchSize sequences.
dataset = dataset.batch(batchSize, drop_remainder=True)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

### Train & save Model

In [140]:
epochs = 1
savePath = "c:/Users/augus/OneDrive/Documents/code/AI/textGen/model2"

# Number of epochs already recorded before this run; used below to mark
# where this run starts on the plots.
previous_epochs = len(history["loss"])

# fit() returns the History object holding the per-epoch metrics of this run.
run = model.fit(dataset, epochs=epochs)
history["loss"].extend(run.history["loss"])
history["accuracy"].extend(run.history["accuracy"])

# Save the model
save(savePath, model, vocab, history)

# Plot the evolution of the accuracy, then of the loss.
for metric, legend_loc in (("accuracy", 'lower left'), ("loss", 'upper left')):
     plt.plot(history[metric])
     plt.legend(['train ' + metric], loc=legend_loc)
     plt.title(metric)
     plt.xlabel('epoch')
     plt.ylabel(metric)
     if metric == "accuracy":
          # Accuracy is a ratio, so pin the axis to its full range.
          plt.ylim(0, 1)
     # Vertical line at the last epoch of the previous run (the point where
     # the dataset changed). Skipped when there is no previous run, which
     # would otherwise draw a spurious marker at x = -1.
     if previous_epochs > 0:
          plt.axvline(previous_epochs - 1, color='r', linestyle='--')
     plt.show()

 2/14 [===>..........................] - ETA: 55s - loss: 0.4597 - accuracy: 0.8989 

KeyboardInterrupt: 