<a href="https://colab.research.google.com/github/antahiap/dsr-nlp/blob/main/notebooks/04_language_generation_lsdyna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import glob
import random
import shutil
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras import models, layers
from tqdm import tqdm
import time

In [6]:
# Where the text files are going to live.
# "all" receives every selected file; "train" and "valid" receive the two
# parts of the split made when the corpus is gathered.
dataset_path = "dataset"
dataset_path_all = os.path.join(dataset_path, "all")
dataset_path_train = os.path.join(dataset_path, "train")
dataset_path_valid = os.path.join(dataset_path, "valid")

# Just use 20 files (upper bound on how many corpus files are copied).
file_number = 20

# Gather the corpus if it has not been gathered yet.
if not os.path.exists(dataset_path):

    # Create all the folders.
    for path in [dataset_path, dataset_path_all, dataset_path_train, dataset_path_valid]:
        if not os.path.exists(path):
            os.mkdir(path)

    # Clone the repo.
    !git clone https://github.com/antahiap/dsr-nlp #https://github.com/vilmibm/lovecraftcorpus

    # Find all the files.
    paths_all = glob.glob("dsr-nlp/data/mat/*.txt")
    print(sorted(paths_all))

    # Do not use all.
    paths_all = paths_all[:file_number]

    # Split 80/20.
    split_index = int(len(paths_all) * 0.8)
    paths_train = paths_all[:split_index]
    paths_valid = paths_all[split_index:]

    # Copy files.
    def copy(paths, destination):
        for path in paths:
            shutil.copy2(path, destination)
    copy(paths_all, dataset_path_all)
    copy(paths_train, dataset_path_train)
    copy(paths_valid, dataset_path_valid)

    # Delete repo.
    !rm -rf dsr-nlp

    # Done.
    print("Corpus downloaded.")

# Data Setup

In [3]:
def create_dataset(data_path, batch_size=32, seed=42):
  """Load the text files under `data_path` as an unlabeled dataset.

  Args:
    data_path: Directory that contains the text files.
    batch_size: Number of files per batch. The batches are only used to
      iterate over the raw texts, not for training. Defaults to 32.
    seed: Random seed forwarded to the loader. Defaults to 42.

  Returns:
    The dataset created by `preprocessing.text_dataset_from_directory`
    with `labels=None`.
  """
  dataset = preprocessing.text_dataset_from_directory(
      data_path,
      labels=None,
      batch_size=batch_size,
      seed=seed
  )
  return dataset

In [4]:
# Load each folder as its own dataset of raw texts: every selected file,
# the training part and the validation part.
dataset_original_all = create_dataset(dataset_path_all)
dataset_original_train = create_dataset(dataset_path_train)
dataset_original_valid = create_dataset(dataset_path_valid)

Found 0 files belonging to 1 classes.


ValueError: ignored

In [None]:
# Sanity check: print every sample of every batch of the full corpus.
# NOTE(review): each sample is a whole file, so this floods the cell output;
# consider printing a single (truncated) sample instead.
for batch in dataset_original_all:
  for sample in batch:
    print(sample)

tf.Tensor(b'FACTS CONCERNING THE LATE ARTHUR JERMYN AND HIS FAMILY\n\nI\n\nLife is a hideous thing, and from the background behind what we know of it peer daemoniacal hints of truth which make it sometimes a thousandfold more hideous. Science, already oppressive with its shocking revelations, will perhaps be the ultimate exterminator of our human species--if separate species we be--for its reserve of unguessed horrors could never be borne by mortal brains if loosed upon the world. If we knew what we are, we should do as Sir Arthur Jermyn did; and Arthur Jermyn soaked himself in oil and set fire to his clothing one night. No one placed the charred fragments in an urn or set a memorial to him who had been; for certain papers and a certain boxed object were found which made men wish to forget. Some who knew him do not admit that he ever existed.\n\nArthur Jermyn went out on the moor and burned himself after seeing the boxed object which had come from Africa. It was this object, and not hi

In [None]:
# Upper bound on the number of distinct tokens the encoder keeps.
vocabulary_size = 10_000      # state of the art: 50_000

# Text -> integer token ids, splitting on whitespace only.
encoder = layers.TextVectorization(
    max_tokens=vocabulary_size,
    standardize=None,  # TODO reconsider: None keeps punctuation and capitalization
    split='whitespace',
    output_mode='int'
)

# Learn the vocabulary from the full corpus (train and validation files).
encoder.adapt(dataset_original_all)

In [None]:
# Keep the learned vocabulary as a list (token id -> token string); it is
# what `decode` indexes into.
vocabulary = encoder.get_vocabulary()
# First 100 entries; in the recorded output index 0 is '' and index 1 is '[UNK]'.
print(vocabulary[:100])
# All remaining entries.
print(vocabulary[100:])

['', '[UNK]', 'the', 'of', 'and', 'to', 'a', 'in', 'I', 'that', 'was', 'had', 'he', 'with', 'as', 'which', 'his', 'my', 'were', 'from', 'for', 'not', 'on', 'it', 'at', 'but', 'by', 'The', 'they', 'be', 'or', 'could', 'all', 'their', 'have', 'no', 'one', 'an', 'would', 'him', 'Carter', 'this', 'when', 'its', 'some', 'me', 'been', 'so', 'is', 'there', 'through', 'into', 'It', 'those', 'what', 'did', 'He', 'only', 'out', 'more', 'old', 'great', 'saw', 'seemed', 'than', 'where', 'very', 'now', 'about', 'whose', 'up', 'over', 'who', 'them', 'other', 'even', 'might', 'down', 'must', 'time', 'any', 'such', 'are', 'before', 'after', 'And', 'these', 'found', 'upon', 'knew', 'though', 'we', 'came', 'things', 'In', 'strange', 'still', 'black', 'There', 'made']


# Dataset for Autoregression

In [None]:
# Number of preceding tokens the model sees when predicting the next token.
sequence_length = 32      # state of the art: 4000
# Token id treated as padding: stripped from each encoded text and then
# prepended `sequence_length` times at its beginning.
padding_token_id = 0

def create_dataset_for_autoregression(dataset):
  """Build (token window, next token) training pairs from batches of texts.

  Every text is encoded, stripped of padding tokens and prefixed with
  `sequence_length` padding tokens. A window of `sequence_length` tokens is
  then slid over it one token at a time; each window becomes an input and
  the token right after it becomes the matching output.

  Args:
    dataset: Iterable of text batches accepted by `encoder`.

  Returns:
    A `tf.data.Dataset` of `(window, next_token)` pairs.
  """
  windows = []
  targets = []

  # Identical for every text, so build it once.
  left_padding = [padding_token_id] * sequence_length

  for text_batch in dataset:
    # The encoder pads every text of a batch to a common length.
    encoded_batch = encoder(text_batch).numpy()

    for encoded_text in tqdm(encoded_batch):

      # Drop the padding tokens, then pad at the beginning instead so that
      # even the very first real token has a full window in front of it.
      tokens = left_padding + [
          token for token in encoded_text if token != padding_token_id
      ]

      # Each position after the leading padding is predicted from the
      # `sequence_length` tokens preceding it.
      for target_position in range(sequence_length, len(tokens)):
        window = tokens[target_position - sequence_length:target_position]
        assert len(window) == sequence_length

        windows.append(window)
        targets.append(tokens[target_position])

  return tf.data.Dataset.from_tensor_slices((windows, targets))

# Turn the raw train and validation texts into (token window, next token)
# pairs for next-token prediction.
dataset_train = create_dataset_for_autoregression(dataset_original_train)
dataset_valid = create_dataset_for_autoregression(dataset_original_valid)
print('Done!')

100%|██████████| 16/16 [00:00<00:00, 24.52it/s]
100%|██████████| 4/4 [00:00<00:00, 151.61it/s]


Done!


In [None]:
def decode(indices):
  """Turn a sequence of token ids back into readable text.

  Args:
    indices: Iterable of integer token ids (Python ints, NumPy integers or
      scalar tensors) that index into `vocabulary`.

  Returns:
    The tokens joined by single spaces. Tokens whose vocabulary entry is the
    empty string (the padding token) are left out.
  """
  tokens = [vocabulary[int(index)] for index in indices]
  # The encoder splits on whitespace, so the tokens have to be re-joined with
  # a separator; without one consecutive words run together. Empty tokens are
  # skipped so that padding does not turn into a run of spaces.
  return ' '.join(token for token in tokens if token)

In [None]:
# Show the first few training pairs, both as raw token ids and decoded.
# The loop variables deliberately avoid the name `input`: loop variables of a
# notebook cell stay in the global namespace, so `input` would shadow the
# builtin for every later cell.
for input_ids, target_id in dataset_train.take(4):
  print('input: ', ', '.join([str(x) for x in input_ids.numpy()]))
  print('output:', target_id.numpy())

  print('input decoded: ', decode(input_ids))
  print('output decoded: ', decode([target_id]))

input:  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
output: 735
input decoded:  
output decoded:  THE
input:  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 735
output: 1
input decoded:  THE
output decoded:  [UNK]
input:  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 735, 1
output: 1
input decoded:  THE[UNK]
output decoded:  [UNK]
input:  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 735, 1, 1
output: 1
input decoded:  THE[UNK][UNK]
output decoded:  [UNK]


In [None]:
def render_history(history):
    """Plot training vs. validation curves for loss and accuracy.

    Two figures are shown one after the other: first the loss curves, then
    the accuracy curves.

    Args:
        history: Either the object returned by `model.fit` (its `.history`
            attribute is used) or that metrics dict itself. The dict must
            contain the keys "loss", "val_loss", "accuracy" and
            "val_accuracy".

    Raises:
        KeyError: If one of the required metrics is missing.
    """
    # Accept both the History object and its plain `.history` dict so that
    # either calling convention works.
    metrics = getattr(history, "history", history)

    figures = (
        ("Training loss vs. validation loss", "loss"),
        ("Training accuracy vs. validation accuracy", "accuracy"),
    )
    for title, name in figures:
        validation_name = "val_" + name
        plt.title(title)
        plt.plot(metrics[name], label=name)
        plt.plot(metrics[validation_name], label=validation_name)
        plt.xlabel("epoch")
        plt.legend()
        plt.show()
        # Release the figure so the next plot starts from a clean state.
        plt.close()

# Train LSTM

In [None]:
# Size of the vector every token id is embedded into.
embedding_size = 128

# Next-token classifier: embed the token window, summarize it with an LSTM
# and predict a probability for every vocabulary entry.
model = models.Sequential([
    layers.Embedding(vocabulary_size, embedding_size, input_length=sequence_length),
    layers.LSTM(256),
    layers.Dense(vocabulary_size, activation='softmax'),
])
model.summary()

# The targets are integer token ids, hence the sparse cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Large batches keep the run quick.
history = model.fit(
    dataset_train.cache().shuffle(1_000_000).batch(1024),
    epochs=10,
    validation_data=dataset_valid.cache().batch(1024),
)


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 32, 128)           1280000   
                                                                 
 lstm_2 (LSTM)               (None, 256)               394240    
                                                                 
 dense_2 (Dense)             (None, 10000)             2570000   
                                                                 
Total params: 4,244,240
Trainable params: 4,244,240
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

In [None]:
import matplotlib.pyplot as plt

# Persist the trained model.
model.save("model/model.h5")
# Plot the learning curves. `render_history` reads `history.history[...]`
# itself, so it has to receive the object returned by `model.fit`, not the
# `.history` dict.
render_history(history)

In [None]:
import numpy as np

def generate(model, seed_text, generated_sequence_length, temperature):
    """Generate text by repeatedly predicting the next token and print it.

    Args:
        model: Model whose `predict` takes a batch of token windows of length
            `sequence_length` and returns one score per vocabulary entry.
        seed_text: Text the generation starts from.
        generated_sequence_length: Total number of tokens of the result,
            seed tokens included.
        temperature: Sampling temperature passed on to
            `get_index_from_prediction`.

    Returns:
        None. The decoded text is printed, followed by an empty line.
    """
    seed_tokens = [int(token) for token in encoder(seed_text).numpy()]

    # The result starts with the complete seed.
    generated_sequence = list(seed_tokens)

    # Fit the seed to the fixed window size the model was trained with: keep
    # only the most recent tokens of a seed that is too long, pad at the
    # beginning when it is too short.
    input_sequence = seed_tokens[-sequence_length:]
    padding = [padding_token_id] * (sequence_length - len(input_sequence))
    input_sequence = padding + input_sequence

    # Generate the sequence by repeatedly predicting.
    while len(generated_sequence) < generated_sequence_length:
        prediction = model.predict(np.expand_dims(input_sequence, axis=0), verbose=0)
        predicted_index = get_index_from_prediction(prediction[0], temperature)
        generated_sequence.append(predicted_index)
        # Slide the window: drop the oldest token, append the newest.
        input_sequence = input_sequence[1:]
        input_sequence.append(predicted_index)

    # Convert the generated sequence to a string.
    text = decode(generated_sequence)
    print(text)
    print("")


def get_index_from_prediction(prediction, temperature=0.0):
    """Pick an index from a probability distribution.

    The temperature reshapes the distribution before sampling: values below
    1 sharpen it towards the most likely entry, values above 1 flatten it
    towards a uniform distribution.

    Args:
        prediction: 1-D sequence of probabilities.
        temperature: 0.0 selects the most likely index deterministically;
            a positive value samples from the reshaped distribution.

    Returns:
        The selected index.

    Raises:
        ValueError: If `temperature` is negative.
    """
    if temperature < 0.0:
        raise ValueError("temperature must be non-negative")

    # Zero temperature - use the argmax.
    if temperature == 0.0:
        return np.argmax(prediction)

    # Non-zero temperature - rescale in log space, then sample.
    probabilities = np.asarray(prediction).astype('float64')
    # log(0) = -inf is intended here (it maps back to probability 0), so the
    # divide-by-zero warning is silenced.
    with np.errstate(divide='ignore'):
        logits = np.log(probabilities) / temperature

    # Softmax. Subtracting the maximum first does not change the result but
    # keeps exp() from underflowing to zero for every entry at low
    # temperatures, which would turn the distribution into NaNs.
    logits = logits - np.max(logits)
    exp_logits = np.exp(logits)
    probabilities = exp_logits / np.sum(exp_logits)

    # One draw from the distribution; the drawn entry is the only non-zero one.
    sample = np.random.multinomial(1, probabilities, 1)
    return np.argmax(sample)


# Print a 100-token text (seed tokens included) grown from a short seed,
# sampling each next token at temperature 1.0.
generate(model, "we are all doomed", 100, temperature=1.0)