<a href="https://colab.research.google.com/github/annamaartensson/dd2424project/blob/models/models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np
import pathlib
import os
import platform

# Print the Python interpreter version and the TensorFlow version in use.
print(platform.python_version())
print(tf.__version__)

3.10.12
2.15.0


In [None]:
def fetch_data():
  """Download Project Gutenberg text #31100 and slice it into individual books.

  The file is fetched (or reused from the local cache under ./tmp) with
  tf.keras.utils.get_file, read fully into memory, and cut at hard-coded
  character offsets.

  Returns:
    A list of eight strings, in this order: persuasion, northanger_abbey,
    mansfield_park, emma, lady_susan, love_and_friendship,
    pride_and_prejudice, sense_and_sensibility.
  """
  cache_dir = './tmp'
  dataset_file_name = 'pg31100.txt'
  dataset_file_origin = 'https://www.gutenberg.org/cache/epub/31100/pg31100.txt'
  dataset_file_path = tf.keras.utils.get_file(fname = dataset_file_name, origin = dataset_file_origin, cache_dir=pathlib.Path(cache_dir).absolute())
  # Use a context manager so the file handle is closed, and pin the encoding:
  # the offsets below are character positions, so decoding with a different
  # platform-default codec would shift every book boundary.
  with open(dataset_file_path, mode='r', encoding='utf-8') as dataset_file:
    text = dataset_file.read()
  # (start, end) character offsets of each book inside the downloaded file.
  # NOTE(review): the last span starts at 3682008 although the previous one
  # ends at 3665048; presumably the text in between is skipped on purpose --
  # confirm.
  book_spans = [
    (1437, 468297),      # persuasion
    (468297, 901707),    # northanger_abbey
    (901707, 1784972),   # mansfield_park
    (1784972, 2668012),  # emma
    (2668012, 2795312),  # lady_susan
    (2795312, 2980261),  # love_and_friendship
    (2980261, 3665048),  # pride_and_prejudice
    (3682008, 4355100),  # sense_and_sensibility
  ]
  return [text[begin:end] for begin, end in book_spans]

In [None]:
def get_vocabulary(text):
  """Build the character vocabulary of `text` and the two lookup layers for it.

  Args:
    text: string whose distinct characters form the vocabulary.

  Returns:
    A tuple (vocabulary, char_to_ind, ind_to_char) where `vocabulary` is the
    sorted list of distinct characters, `char_to_ind` is a StringLookup layer
    built from it, and `ind_to_char` is the inverted StringLookup built from
    `char_to_ind.get_vocabulary()` so both layers share one index space.
  """
  unique_chars = set(text)
  vocabulary = sorted(unique_chars)
  # The forward layer gets its own copy of the list.
  char_to_ind = tf.keras.layers.StringLookup(
    vocabulary = vocabulary.copy(),
    mask_token = None,
  )
  lookup_tokens = char_to_ind.get_vocabulary()
  ind_to_char = tf.keras.layers.StringLookup(
    vocabulary = lookup_tokens,
    invert = True,
    mask_token = None,
  )
  return vocabulary, char_to_ind, ind_to_char

In [None]:
def batch_data(text, seq_length, batch_size, buffer_size, char_to_ind):
  """Turn `text` into shuffled batches of (input, target) index sequences.

  Args:
    text: string to encode.
    seq_length: number of characters in each input sequence.
    batch_size: number of sequence pairs per batch.
    buffer_size: size of the shuffle buffer.
    char_to_ind: lookup layer mapping characters to integer indices.

  Returns:
    A tf.data.Dataset of batches; incomplete chunks and batches are dropped.
  """
  def split_input_target(chunk):
    # Input is the first seq_length ids; target is the chunk shifted by one.
    return chunk[:seq_length], chunk[1:]

  char_ids = char_to_ind(tf.strings.unicode_split(text, 'UTF-8'))
  id_stream = tf.data.Dataset.from_tensor_slices(char_ids)
  # Each chunk holds seq_length + 1 ids so that input and target overlap.
  chunks = id_stream.batch(seq_length + 1, drop_remainder = True)
  pairs = chunks.map(split_input_target)
  shuffled = pairs.shuffle(buffer_size)
  batched = shuffled.batch(batch_size, drop_remainder = True)
  return batched.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
class RNN(tf.keras.Model):
  """Character-level model: fixed one-hot embedding -> SimpleRNN -> Dense logits.

  Args:
    K: size of the index space; used as the embedding input and output
      dimension and as the number of Dense output units.
    m: number of units in the SimpleRNN layer.
  """

  def __init__(self, K, m):
    # Call the base initializer without arguments. The original passed `self`
    # as an extra positional argument, which newer Keras versions reject.
    super().__init__()
    # A frozen identity-initialised embedding acts as a one-hot encoder.
    self.onehot = tf.keras.layers.Embedding(K, K, embeddings_initializer = 'identity', trainable = False)
    self.rnn = tf.keras.layers.SimpleRNN(m, return_sequences = True, return_state = True)
    self.dense = tf.keras.layers.Dense(K)

  def call(self, inputs, states = None, return_state = False, training = False):
    """Run the model on a batch of index sequences.

    Args:
      inputs: integer indices fed to the embedding layer.
      states: initial recurrent state; when None the layer's own initial
        state is used.
      return_state: when True, also return the final recurrent state.
      training: forwarded to every layer.

    Returns:
      The Dense outputs for every time step, or a tuple (outputs, states)
      when `return_state` is True.
    """
    x = inputs
    x = self.onehot(x, training = training)
    if states is None:
      states = self.rnn.get_initial_state(x)
    x, states = self.rnn(x, initial_state = states, training = training)
    x = self.dense(x, training = training)
    if return_state:
      return x, states
    else:
      return x

class LSTM(tf.keras.Model):
  """Character-level model: fixed one-hot embedding -> LSTM -> Dense logits.

  Args:
    K: size of the index space; used as the embedding input and output
      dimension and as the number of Dense output units.
    m: number of units in the LSTM layer.
  """

  def __init__(self, K, m):
    # Call the base initializer without arguments. The original passed `self`
    # as an extra positional argument, which newer Keras versions reject.
    super().__init__()
    # A frozen identity-initialised embedding acts as a one-hot encoder.
    self.onehot = tf.keras.layers.Embedding(K, K, embeddings_initializer = 'identity', trainable = False)
    self.lstm = tf.keras.layers.LSTM(m, return_sequences = True, return_state = True)
    self.dense = tf.keras.layers.Dense(K)

  def call(self, inputs, states = None, return_state = False, training = False):
    """Run the model on a batch of index sequences.

    Args:
      inputs: integer indices fed to the embedding layer.
      states: initial LSTM state; when None the layer's own initial state
        is used.
      return_state: when True, also return the final LSTM state.
      training: forwarded to every layer.

    Returns:
      The Dense outputs for every time step, or a tuple (outputs, states)
      when `return_state` is True, where `states` is the list of state
      tensors returned by the LSTM layer.
    """
    x = inputs
    x = self.onehot(x, training = training)
    if states is None:
      states = self.lstm.get_initial_state(x)
    # The layer returns the sequence output followed by its state tensors.
    x, *states = self.lstm(x, initial_state = states, training = training)
    x = self.dense(x, training = training)
    if return_state:
      return x, states
    else:
      return x

class LSTM2(tf.keras.Model):
  """Character-level model: fixed one-hot embedding -> two stacked LSTMs -> Dense logits.

  Args:
    K: size of the index space; used as the embedding input and output
      dimension and as the number of Dense output units.
    m: number of units in each of the two LSTM layers.
  """

  def __init__(self, K, m):
    # Call the base initializer without arguments. The original passed `self`
    # as an extra positional argument, which newer Keras versions reject.
    super().__init__()
    # A frozen identity-initialised embedding acts as a one-hot encoder.
    self.onehot = tf.keras.layers.Embedding(K, K, embeddings_initializer = 'identity', trainable = False)
    self.lstm1 = tf.keras.layers.LSTM(m, return_sequences = True, return_state = True)
    self.lstm2 = tf.keras.layers.LSTM(m, return_sequences = True, return_state = True)
    self.dense = tf.keras.layers.Dense(K)

  def call(self, inputs, states = None, return_state = False, training = False):
    """Run the model on a batch of index sequences.

    Args:
      inputs: integer indices fed to the embedding layer.
      states: pair [states_1, states_2] with the initial state of each LSTM
        layer; when None each layer's own initial state is used.
      return_state: when True, also return the final states of both layers.
      training: forwarded to every layer.

    Returns:
      The Dense outputs for every time step, or a tuple
      (outputs, [states_1, states_2]) when `return_state` is True.
    """
    x = inputs
    x = self.onehot(x, training = training)
    if states is None:
      states_1 = self.lstm1.get_initial_state(x)
      # Ask the second layer for its own initial state instead of aliasing
      # the first layer's, so the two layers never share state objects.
      states_2 = self.lstm2.get_initial_state(x)
    else:
      states_1 = states[0]
      states_2 = states[1]
    # Each layer returns the sequence output followed by its state tensors.
    x, *states_1 = self.lstm1(x, initial_state = states_1, training = training)
    x, *states_2 = self.lstm2(x, initial_state = states_2, training = training)
    x = self.dense(x, training = training)
    if return_state:
      return x, [states_1, states_2]
    else:
      return x

In [None]:
def generate_text_temperature(start, length, model, char_to_ind, ind_to_char, T = 1.0):
  """Generate `length` characters by sampling from temperature-scaled logits.

  Args:
    start: string tensor with the seed text; only the first element of each
      prediction is kept, so a single seed string is expected.
    length: number of characters to generate.
    model: model called as model(inputs=..., states=..., return_state=True)
      and returning (logits, states).
    char_to_ind: lookup layer mapping characters to indices.
    ind_to_char: lookup layer mapping indices back to characters.
    T: temperature the logits are divided by; must be positive.

  Returns:
    The generated characters joined into one Python string. The seed text
    itself is not included. An empty string is returned when `length` is 0.

  Raises:
    ValueError: if `T` is not positive.
  """
  if T <= 0:
    raise ValueError('T must be positive, got {}'.format(T))
  # Additive mask that puts -inf on the '[UNK]' index so it is never sampled.
  # It does not depend on the loop state, so it is built once up front.
  unk_inds = char_to_ind(['[UNK]'])[:, None]
  sparse_unk_mask = tf.SparseTensor(values = [-float('inf')]*len(unk_inds), indices = unk_inds, dense_shape=[len(char_to_ind.get_vocabulary())])
  unk_mask = tf.sparse.to_dense(sparse_unk_mask)
  text = []
  states = None
  for _ in range(length):
    # First pass feeds the whole seed; later passes feed only the character
    # sampled last, with the recurrent state carried over.
    chars = tf.strings.unicode_split(start, 'UTF-8')
    logits, states = model(inputs = char_to_ind(chars).to_tensor(), states = states, return_state = True)
    # Keep only the logits of the last time step.
    logits = logits[:, -1, :]/T + unk_mask
    pred = tf.random.categorical(logits, num_samples = 1)
    start = ind_to_char(tf.squeeze(pred, axis = -1))
    text.append(start[0].numpy().decode('utf-8'))
  # Plain join of the decoded characters; also well-defined for an empty list.
  return ''.join(text)

def generate_text_nucleus(start, length, model, char_to_ind, ind_to_char, theta = 1.0):
  """Generate `length` characters by sampling from the most probable characters only.

  At every step the probabilities are sorted in descending order and the
  longest prefix whose cumulative sum is at most `theta` is kept (at least
  the single most probable character); sampling is restricted to characters
  whose probability is at least that of the last kept one.

  Args:
    start: string tensor with the seed text; the batch axis of the logits is
      squeezed, so a single seed string is expected.
    length: number of characters to generate.
    model: model called as model(inputs=..., states=..., return_state=True)
      and returning (logits, states).
    char_to_ind: lookup layer mapping characters to indices.
    ind_to_char: lookup layer mapping indices back to characters.
    theta: cumulative-probability threshold.

  Returns:
    The generated characters joined into one Python string. The seed text
    itself is not included. An empty string is returned when `length` is 0.
  """
  # Additive mask that puts -inf on the '[UNK]' index so it is never sampled.
  # It does not depend on the loop state, so it is built once up front.
  unk_inds = char_to_ind(['[UNK]'])[:, None]
  sparse_unk_mask = tf.SparseTensor(values = [-float('inf')]*len(unk_inds), indices = unk_inds, dense_shape=[len(char_to_ind.get_vocabulary())])
  unk_mask = tf.sparse.to_dense(sparse_unk_mask)
  text = []
  states = None
  for _ in range(length):
    # First pass feeds the whole seed; later passes feed only the character
    # sampled last, with the recurrent state carried over.
    chars = tf.strings.unicode_split(start, 'UTF-8')
    logits, states = model(inputs = char_to_ind(chars).to_tensor(), states = states, return_state = True)
    # Keep only the logits of the last time step and drop the batch axis.
    logits = logits[:, -1, :] + unk_mask
    logits = tf.squeeze(logits, axis = 0)
    probs = tf.nn.softmax(logits)
    sorted_probs = tf.sort(probs, direction = 'DESCENDING')
    sorted_probs_sum = tf.math.cumsum(sorted_probs)
    thresh_inds = tf.where(sorted_probs_sum <= theta)
    if len(thresh_inds) > 0:
      thresh_ind = thresh_inds[-1, 0].numpy()
    else:
      # Even the top probability exceeds theta: keep just that character.
      thresh_ind = 0
    # Zero out everything below the cut-off probability and rescale.
    top_probs = tf.multiply(probs, tf.cast(probs >= sorted_probs[thresh_ind], 'float32'))/sorted_probs_sum[thresh_ind]
    # Excluded characters get log(0) = -inf and can therefore not be drawn.
    pred = tf.random.categorical([tf.math.log(top_probs)], num_samples = 1)
    start = ind_to_char(tf.squeeze(pred, axis = -1))
    text.append(start[0].numpy().decode('utf-8'))
  # Plain join of the decoded characters; also well-defined for an empty list.
  return ''.join(text)

In [None]:
# Load the list of book texts; indices follow the order returned by fetch_data.
books = fetch_data()

# books[0] is the first entry (persuasion); the commented-out tail would add
# three more books to the training text.
training_text = books[0]# + books[1] + books[2] + books[3]
# books[6] and books[7] are the last two entries returned by fetch_data.
validation_text = books[6]
test_text = books[7]

# The vocabulary and both lookup layers are derived from the training text only.
vocabulary, char_to_ind, ind_to_char = get_vocabulary(training_text)

In [None]:
# Batching parameters passed to batch_data below.
seq_length = 100      # characters per input sequence
batch_size = 64       # sequence pairs per batch
buffer_size = 10000   # shuffle buffer size

training_batches = batch_data(training_text, seq_length, batch_size, buffer_size, char_to_ind)

In [None]:
# K is the number of entries in the lookup layer's vocabulary; m is the number
# of units given to each recurrent layer.
K = len(char_to_ind.get_vocabulary())
m = 256

# The model outputs raw logits, hence from_logits = True in the loss.
model = LSTM2(K = K, m = m)
model.compile(optimizer = tf.optimizers.Adam(learning_rate = 0.001), loss = tf.losses.SparseCategoricalCrossentropy(from_logits = True))

# Weights-only checkpoints; the file name is templated on the epoch number.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath = checkpoint_prefix, save_weights_only = True)

# Train on the training batches only; no validation data is passed here.
history = model.fit(training_batches, epochs = 40, callbacks = [checkpoint_callback])

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [None]:
# Seed text, number of characters to generate, and sampling temperature.
start = tf.constant(['Elizabeth'])
length = 500
T = 1.0

# The printed text contains only the generated characters, not the seed.
print(generate_text_temperature(start, length, model, char_to_ind, ind_to_char, T))

, grod it.  He mested to he mamay, he
reaghed, price.  The word manirely; I do now so quite, to her passing a, and Adne.  How not a kendures of still
the obistors, our over cousin of Kellymins Harvizabe wished in the must be-diefference in to decess in engermany from "and imetimed, and reporting at horse it
presention
before me and himself than very gracement, in mothing.  Her distous dealiced; I will fatter to be dividewatly; and Lady your sister-nor his nifity him and evinity while,"
said Mrss
