<a href="https://colab.research.google.com/github/annamaartensson/dd2424project/blob/issue%2F12/models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import tensorflow as tf
import numpy as np
import pathlib
import os
import platform
import re

# Record the interpreter and TensorFlow versions this notebook was run with.
for version in (platform.python_version(), tf.__version__):
  print(version)

3.10.12
2.15.0


In [32]:
def fetch_data():
  """Download the Project Gutenberg text file pg31100.txt and cut it into books.

  The file is fetched through tf.keras.utils.get_file, using ./tmp (made
  absolute) as the cache directory, read as UTF-8 text and sliced at
  hard-coded character offsets.

  Returns:
    A list of eight strings, in this order: persuasion, northanger_abbey,
    mansfield_park, emma, lady_susan, love_and_friendship,
    pride_and_predjudice, sense_and_sensibility.
  """
  cache_dir = './tmp'
  dataset_file_name = 'pg31100.txt'
  dataset_file_origin = 'https://www.gutenberg.org/cache/epub/31100/pg31100.txt'
  dataset_file_path = tf.keras.utils.get_file(fname = dataset_file_name, origin = dataset_file_origin, cache_dir=pathlib.Path(cache_dir).absolute())
  # The slice boundaries below are character offsets, so the decoding must not
  # depend on the platform's default encoding; the context manager also makes
  # sure the file handle is closed.
  with open(dataset_file_path, mode='r', encoding='utf-8') as dataset_file:
    text = dataset_file.read()
  # NOTE(review): the offsets presumably match the file as served when the
  # notebook was written -- confirm if the upstream file changes.
  persuasion = text[1437:468297]
  northanger_abbey = text[468297:901707]
  mansfield_park = text[901707:1784972]
  emma = text[1784972:2668012]
  lady_susan = text[2668012:2795312]
  love_and_friendship = text[2795312:2980261]
  pride_and_predjudice = text[2980261:3665048]
  # Characters 3665048..3682008 are not assigned to any book.
  sense_and_sensibility = text[3682008:4355100]
  books = [persuasion, northanger_abbey, mansfield_park, emma, lady_susan, love_and_friendship, pride_and_predjudice, sense_and_sensibility]
  return books

In [33]:
def get_vocabulary(text):
  """Collect the distinct characters of `text` and build lookup layers for them.

  Returns:
    A tuple (vocabulary, char_to_ind, ind_to_char): the sorted list of
    characters, a StringLookup layer built from that list, and the inverted
    StringLookup layer built from the first layer's own vocabulary.
  """
  vocabulary = list(set(text))
  vocabulary.sort()
  # The forward layer gets its own copy of the character list.
  encoder = tf.keras.layers.StringLookup(vocabulary = vocabulary[:], mask_token = None)
  # The inverse layer is built from the forward layer's vocabulary so that
  # both layers agree on the index of every entry.
  decoder = tf.keras.layers.StringLookup(vocabulary = encoder.get_vocabulary(), invert = True, mask_token = None)
  return vocabulary, encoder, decoder

In [51]:
def batch_data(text, seq_length, char_to_ind, batch_size = 1, buffer_size = 0):
  """Turn `text` into a batched tf.data pipeline of (input, target) sequences.

  The text is split into characters, mapped through `char_to_ind`, and cut
  into consecutive chunks of seq_length+1 ids (an incomplete last chunk is
  dropped). Each chunk yields the pair (chunk[:seq_length], chunk[1:]), i.e.
  the target is the input shifted by one position.

  Args:
    text: the string to encode.
    seq_length: number of ids in every input/target sequence.
    char_to_ind: callable mapping a tensor of characters to ids.
    batch_size: sequences per batch; incomplete batches are dropped.
    buffer_size: shuffle buffer size; shuffling is skipped unless it is > 0.

  Returns:
    The batched, prefetched dataset.
  """
  def split_input_target(chunk):
    return chunk[:seq_length], chunk[1:]

  char_ids = char_to_ind(tf.strings.unicode_split(text, 'UTF-8'))
  chunks = tf.data.Dataset.from_tensor_slices(char_ids).batch(seq_length+1, drop_remainder = True)
  pairs = chunks.map(split_input_target)
  if buffer_size > 0:
    pairs = pairs.shuffle(buffer_size)
  return pairs.batch(batch_size, drop_remainder = True).prefetch(tf.data.experimental.AUTOTUNE)

In [35]:
def clean_text(text):
  """Normalise `text` and split it into a list of lower-case words.

  The characters & [ ] _ ! ? * . , ( ) ; : " ' and the ASCII digits are
  removed outright; newlines and hyphens are turned into spaces; the result is
  split on whitespace.

  Args:
    text: the string to clean.

  Returns:
    A list of words (empty when `text` contains no words).
  """
  lower = text.lower()
  # Raw strings: the previous non-raw pattern relied on invalid escape
  # sequences such as "\&" and "\[", which newer Python versions warn about.
  no_spec = re.sub(r"[&\[\]_!?*.,();:0-9\"']", "", lower)
  no_enter = re.sub(r"[\n-]", " ", no_spec)
  return no_enter.split()

In [36]:
def get_dictionary(text):
  """Return the set of distinct words that clean_text extracts from `text`."""
  return set(clean_text(text))

In [37]:
def correctly_spelled(text, dictionary):
  """Return the fraction of words in `text` that occur in `dictionary`.

  Args:
    text: the string to score; it is tokenised with clean_text.
    dictionary: a container of known words supporting `in`.

  Returns:
    A float in [0, 1]. A text without any words scores 0.0 instead of raising
    ZeroDivisionError, so an empty sample cannot abort a training run.
  """
  # Tokenise once; the word list is used for both the count and the total.
  words = clean_text(text)
  if not words:
    return 0.0
  count = sum(1 for w in words if w in dictionary)
  return count/len(words)

In [38]:
class RNN(tf.keras.Model):
  """Character model: one-hot encoding -> SimpleRNN -> Dense.

  Args:
    K: number of distinct input ids; also the width of the output layer.
    m: number of units in the recurrent layer.
  """

  def __init__(self, K, m):
    # No positional arguments: the instance must not be passed to the base
    # constructor a second time.
    super().__init__()
    # A frozen K x K embedding initialised to the identity maps every id to
    # its one-hot vector.
    self.onehot = tf.keras.layers.Embedding(K, K, embeddings_initializer = 'identity', trainable = False)
    self.rnn = tf.keras.layers.SimpleRNN(m, return_sequences = True, return_state = True)
    self.dense = tf.keras.layers.Dense(K)

  def call(self, inputs, states = None, return_state = False, training = False):
    """Run the model on a batch of id sequences.

    Args:
      inputs: integer ids (presumably shaped (batch, time) -- confirm with callers).
      states: initial recurrent state; the layer's own initial state is used
        when None.
      return_state: when True, also return the final recurrent state.
      training: forwarded to every layer.

    Returns:
      The dense layer's output for every time step, or the tuple
      (output, final state) when `return_state` is True.
    """
    x = inputs
    x = self.onehot(x, training = training)
    if states is None:
      states = self.rnn.get_initial_state(x)
    x, states = self.rnn(x, initial_state = states, training = training)
    x = self.dense(x, training = training)
    if return_state:
      return x, states
    else:
      return x

class LSTM(tf.keras.Model):
  """Character model: one-hot encoding -> LSTM -> Dense.

  Args:
    K: number of distinct input ids; also the width of the output layer.
    m: number of units in the LSTM layer.
  """

  def __init__(self, K, m):
    # No positional arguments: the instance must not be passed to the base
    # constructor a second time.
    super().__init__()
    # A frozen K x K embedding initialised to the identity maps every id to
    # its one-hot vector.
    self.onehot = tf.keras.layers.Embedding(K, K, embeddings_initializer = 'identity', trainable = False)
    self.lstm = tf.keras.layers.LSTM(m, return_sequences = True, return_state = True)
    self.dense = tf.keras.layers.Dense(K)

  def call(self, inputs, states = None, return_state = False, training = False):
    """Run the model on a batch of id sequences.

    Args:
      inputs: integer ids (presumably shaped (batch, time) -- confirm with callers).
      states: initial LSTM state; the layer's own initial state is used when
        None.
      return_state: when True, also return the final LSTM state.
      training: forwarded to every layer.

    Returns:
      The dense layer's output for every time step, or the tuple
      (output, states) when `return_state` is True, where `states` is the list
      of state tensors the LSTM layer returned after its output.
    """
    x = inputs
    x = self.onehot(x, training = training)
    if states is None:
      states = self.lstm.get_initial_state(x)
    # With return_state=True the layer returns its output followed by its
    # state tensors; collect the latter in a list.
    x, *states = self.lstm(x, initial_state = states, training = training)
    x = self.dense(x, training = training)
    if return_state:
      return x, states
    else:
      return x

class LSTM2(tf.keras.Model):
  """Character model: one-hot encoding -> two stacked LSTM layers -> Dense.

  Args:
    K: number of distinct input ids; also the width of the output layer.
    m: number of units in each LSTM layer.
  """

  def __init__(self, K, m):
    # No positional arguments: the instance must not be passed to the base
    # constructor a second time.
    super().__init__()
    # A frozen K x K embedding initialised to the identity maps every id to
    # its one-hot vector.
    self.onehot = tf.keras.layers.Embedding(K, K, embeddings_initializer = 'identity', trainable = False)
    self.lstm1 = tf.keras.layers.LSTM(m, return_sequences = True, return_state = True)
    self.lstm2 = tf.keras.layers.LSTM(m, return_sequences = True, return_state = True)
    self.dense = tf.keras.layers.Dense(K)

  def call(self, inputs, states = None, return_state = False, training = False):
    """Run the model on a batch of id sequences.

    Args:
      inputs: integer ids (presumably shaped (batch, time) -- confirm with callers).
      states: None, or a pair [states of lstm1, states of lstm2] as returned
        by a previous call with return_state=True.
      return_state: when True, also return the final states of both layers.
      training: forwarded to every layer.

    Returns:
      The dense layer's output for every time step, or the tuple
      (output, [states_1, states_2]) when `return_state` is True.
    """
    x = inputs
    x = self.onehot(x, training = training)
    if states is None:
      states_1 = self.lstm1.get_initial_state(x)
      # Ask the second layer for its own initial state instead of borrowing
      # the first layer's, which only matched because both layers have m units.
      states_2 = self.lstm2.get_initial_state(x)
    else:
      states_1 = states[0]
      states_2 = states[1]
    x, *states_1 = self.lstm1(x, initial_state = states_1, training = training)
    x, *states_2 = self.lstm2(x, initial_state = states_2, training = training)
    x = self.dense(x, training = training)
    if return_state:
      return x, [states_1, states_2]
    else:
      return x

In [39]:
def generate_text_temperature(start, length, model, char_to_ind, ind_to_char, T = 1.0):
  """Generate `length` characters by temperature sampling.

  Each step feeds the current seed to the model (carrying the recurrent state
  over from the previous step), divides the logits of the last time step by
  `T`, forbids the '[UNK]' entry, samples one id and uses the sampled
  character as the next seed.

  Args:
    start: string tensor holding the seed, e.g. tf.constant(['.']).
    length: number of characters to generate.
    model: callable accepting inputs=, states= and return_state=True and
      returning (logits, states).
    char_to_ind: lookup layer mapping characters to ids.
    ind_to_char: lookup layer mapping ids back to characters.
    T: sampling temperature; must be positive.

  Returns:
    The generated characters as one Python string. The seed itself is not
    included, and only the first batch element is collected.

  Raises:
    ValueError: if `T` is not positive (a zero temperature would turn the
      logits into inf/NaN).
  """
  if T <= 0:
    raise ValueError("T must be positive, got {}".format(T))
  # The mask that pushes the '[UNK]' logit to -inf does not change between
  # steps, so build it once instead of once per generated character.
  unk_inds = char_to_ind(['[UNK]'])[:, None]
  sparse_unk_mask = tf.SparseTensor(values = [-float('inf')]*len(unk_inds), indices = unk_inds, dense_shape=[len(char_to_ind.get_vocabulary())])
  unk_mask = tf.sparse.to_dense(sparse_unk_mask)

  text = []
  states = None
  for _ in range(length):
    chars = tf.strings.unicode_split(start, 'UTF-8')
    logits, states = model(inputs = char_to_ind(chars).to_tensor(), states = states, return_state = True)
    # Only the prediction for the last time step is used.
    logits = logits[:, -1, :]/T + unk_mask
    pred = tf.random.categorical(logits, num_samples = 1)
    start = ind_to_char(tf.squeeze(pred, axis = -1))
    text.append(start[0].numpy().decode('utf-8'))
  return ''.join(text)

def generate_text_nucleus(start, length, model, char_to_ind, ind_to_char, theta = 1.0):
  """Generate `length` characters by nucleus (top-p style) sampling.

  Each step feeds the current seed to the model (carrying the recurrent state
  over from the previous step), forbids the '[UNK]' entry, and converts the
  logits of the last time step to probabilities. The probabilities are sorted
  in descending order and the longest prefix whose cumulative sum is still
  <= `theta` is kept (just the most likely entry when even that one exceeds
  `theta`). Every entry at least as probable as the last kept one stays
  eligible; the next character is sampled among those.

  Args:
    start: string tensor holding the seed, e.g. tf.constant(['.']). The batch
      dimension is squeezed away, so a single seed is expected.
    length: number of characters to generate.
    model: callable accepting inputs=, states= and return_state=True and
      returning (logits, states).
    char_to_ind: lookup layer mapping characters to ids.
    ind_to_char: lookup layer mapping ids back to characters.
    theta: cumulative-probability threshold.

  Returns:
    The generated characters as one Python string; the seed itself is not
    included.
  """
  # The mask that pushes the '[UNK]' logit to -inf does not change between
  # steps, so build it once instead of once per generated character.
  unk_inds = char_to_ind(['[UNK]'])[:, None]
  sparse_unk_mask = tf.SparseTensor(values = [-float('inf')]*len(unk_inds), indices = unk_inds, dense_shape=[len(char_to_ind.get_vocabulary())])
  unk_mask = tf.sparse.to_dense(sparse_unk_mask)

  text = []
  states = None
  for _ in range(length):
    chars = tf.strings.unicode_split(start, 'UTF-8')
    logits, states = model(inputs = char_to_ind(chars).to_tensor(), states = states, return_state = True)
    # Only the prediction for the last time step is used.
    logits = tf.squeeze(logits[:, -1, :] + unk_mask, axis = 0)
    probs = tf.nn.softmax(logits)
    sorted_probs = tf.sort(probs, direction = 'DESCENDING')
    sorted_probs_sum = tf.math.cumsum(sorted_probs)
    thresh_inds = tf.where(sorted_probs_sum <= theta)
    if len(thresh_inds) > 0:
      thresh_ind = thresh_inds[-1, 0].numpy()
    else:
      # Even the most likely entry exceeds theta: fall back to that entry.
      thresh_ind = 0
    # Zero out everything below the cut-off and rescale by the kept mass.
    top_probs = tf.multiply(probs, tf.cast(probs >= sorted_probs[thresh_ind], 'float32'))/sorted_probs_sum[thresh_ind]
    # Excluded entries have probability 0, i.e. log-probability -inf.
    pred = tf.random.categorical([tf.math.log(top_probs)], num_samples = 1)
    start = ind_to_char(tf.squeeze(pred, axis = -1))
    text.append(start[0].numpy().decode('utf-8'))
  return ''.join(text)

In [69]:
books = fetch_data()

# Training uses only the first book (books[1]..books[5] are currently left
# out); the last two books are held out for validation and testing.
training_text, validation_text, test_text = books[0], books[6], books[7]

# Both the character vocabulary and the word dictionary come from the
# training text only.
dictionary = get_dictionary(training_text)
vocabulary, char_to_ind, ind_to_char = get_vocabulary(training_text)

In [70]:
seq_length = 100
batch_size = 64
buffer_size = 10000

# Shuffled mini-batches for training.
training_batches = batch_data(training_text, seq_length, char_to_ind, batch_size, buffer_size)
# Validation must run on the held-out text; it previously re-used
# training_text, so the reported validation loss was measured on training data.
# Left at batch_data's defaults: batch size 1, no shuffling.
validation_batches = batch_data(validation_text, seq_length, char_to_ind)

In [73]:
class SpellChecker(tf.keras.callbacks.Callback):
  """Callback that prints, after every epoch, the share of known words in generated text.

  Relies on the notebook-level names char_to_ind, ind_to_char and dictionary.
  """

  def on_epoch_end(self, epoch, logs = None):
    """Generate 1000 characters per sampling setting and print each spelling score."""
    start = tf.constant(['.'])
    length = 1000
    # (printed label, generator function, sampling keyword argument)
    settings = (
      ("\nCorrectly spelled (T = 1.0):", generate_text_temperature, {'T': 1.0}),
      ("Correctly spelled (T = 0.75):", generate_text_temperature, {'T': 0.75}),
      ("Correctly spelled (T = 0.5):", generate_text_temperature, {'T': 0.5}),
      ("Correctly spelled (theta = 1.0):", generate_text_nucleus, {'theta': 1.0}),
      ("Correctly spelled (theta = 0.75):", generate_text_nucleus, {'theta': 0.75}),
      ("Correctly spelled (theta = 0.5):", generate_text_nucleus, {'theta': 0.5}),
    )
    for label, generate, sampling in settings:
      sample = generate(start, length, self.model, char_to_ind, ind_to_char, **sampling)
      print(label, correctly_spelled(sample, dictionary))

In [74]:
# Output width: one unit per entry of the lookup layer's vocabulary.
K = len(char_to_ind.get_vocabulary())
# Number of recurrent units.
m = 128

model = LSTM(K = K, m = m)
# The model outputs raw logits, hence from_logits = True.
model.compile(
  optimizer = tf.optimizers.Adam(learning_rate = 0.001),
  loss = tf.losses.SparseCategoricalCrossentropy(from_logits = True),
)

# Save the weights after every epoch as ./training_checkpoints/ckpt_<epoch>.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
  filepath = checkpoint_prefix,
  save_weights_only = True,
)

history = model.fit(
  training_batches,
  epochs = 10,
  callbacks = [checkpoint_callback, SpellChecker()],
  validation_data = validation_batches,
)

Epoch 1/10
Correctly spelled (T = 0.75): 0.16037735849056603
Correctly spelled (T = 0.5): 0.22406639004149378


KeyboardInterrupt: 

In [44]:
# Sample 1000 characters from the trained model, seeded with a full stop,
# then report the share of known words followed by the sample itself.
start, length, T = tf.constant(['.']), 1000, 1.0

text = generate_text_temperature(
  start = start,
  length = length,
  model = model,
  char_to_ind = char_to_ind,
  ind_to_char = ind_to_char,
  T = T,
)
print("Correctly spelled:", correctly_spelled(text, dictionary))
print(text)

Correctly spelled: 0.4012345679012346
  Annes, thith ow sormence, ho nous fich hean nt asserte not had hes
inkas tabveriel of ioneto siruly shiokedingace at her foreple ferchore on
ouptains in plowent emung an
mearos. uI hadengechiod,
of
I wad have kenwerst of arl ereable; and showe spalimed to nother virischers rotselp as and inthardy nopes.' 
ould with in ertasilief of the care
to bell besplainened poorlibater, and wowens.  He werowe, al dirsuong Both hid hat has reade thead goonot worled, bewers, and had han that intanitave san which, and you aprares acoust his mastremwhad in tho hof, of tand mosed if biths ang arvingion.  I lest as tund paines.  He arded in eathers mugh uf soof forede, whethel hat anme, to guvencesppe thenkers admemfrees ulaty."

The listoreing the
have on himpanterenres lolked.  Theur, welling teat andith of her thought.
 Loued her ulluleds bot nevert.  He-lade must corestliotss ithead of touyijuth to hid sithe of whthe were, the yfforingeved you gotabreth's rellb