<a href="https://colab.research.google.com/github/annamaartensson/dd2424project/blob/issue%2F15d/models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qq -U wandb

In [None]:
import wandb
wandb.login()

In [68]:
import tensorflow as tf
import numpy as np
import pathlib
import os
import platform
import re

Fetch and process data

In [69]:
def fetch_data():
  cache_dir = "./tmp"
  dataset_file_name = "pg31100.txt"
  dataset_file_origin = "https://www.gutenberg.org/cache/epub/31100/pg31100.txt"
  dataset_file_path = tf.keras.utils.get_file(fname = dataset_file_name, origin = dataset_file_origin, cache_dir=pathlib.Path(cache_dir).absolute())
  text = open(dataset_file_path, mode = "r").read()
  persuasion = text[1437:468297]
  northanger_abbey = text[468297:901707]
  mansfield_park = text[901707:1784972]
  emma = text[1784972:2668012]
  lady_susan = text[2668012:2795312]
  love_and_friendship = text[2795312:2980261]
  pride_and_predjudice = text[2980261:3665048]
  sense_and_sensibility = text[3682008:4355100]
  full_text = text[1437:4355100]
  books = [persuasion, northanger_abbey, mansfield_park, emma, lady_susan, love_and_friendship, pride_and_predjudice, sense_and_sensibility]
  return books

Text to tensor encoders

In [70]:
class BasicEncoder:

  def __init__(self, text):
    self.vocabulary = sorted(set(text))
    self.ind_to_token = list(self.vocabulary)
    self.ind_to_token.insert(0, "[UNK]")
    self.token_to_ind = {self.ind_to_token[i] : i for i in range(len(self.ind_to_token))}

  def get_size(self):
    return len(self.ind_to_token)

  def text_to_inds(self, text):
    inds = []
    for c in text:
      if c in self.token_to_ind:
        inds.append(self.token_to_ind[c])
      else:
        inds.append(self.token_to_ind["[UNK]"])
    return inds

class BytePairEncoder(BasicEncoder):

  def __init__(self, text, target_size):
    super().__init__(text)
    self.__expand_vocabulary(text, target_size)

  def __merge_pairs(self, tokens, pair, val):
    merged_tokens = []
    i = 0
    while i < len(tokens):
      if tokens[i] == pair[0] and i < len(tokens)-1 and tokens[i+1] == pair[1]:
        merged_tokens.append(val)
        i += 2
      else:
        merged_tokens.append(tokens[i])
        i += 1
    return merged_tokens

  def __get_pair_counts(self, tokens):
    counts = {}
    for i in range(len(tokens)-1):
      pair = tokens[i], tokens[i+1]
      if pair not in counts:
        counts[pair] = 1
      else:
        counts[pair] += 1
    return counts

  def __expand_vocabulary(self, text, target_size):
    self.merges = {}
    tokens = [self.token_to_ind[c] for c in text]
    while self.get_size() < target_size:
      counts = self.__get_pair_counts(tokens)
      best_pair = max(counts, key = counts.get)
      new_token = self.ind_to_token[best_pair[0]]+self.ind_to_token[best_pair[1]]
      new_val = len(self.ind_to_token)
      self.ind_to_token.append(new_token)
      self.token_to_ind[new_token] = new_val
      self.merges[best_pair] = new_val
      tokens = self.__merge_pairs(tokens, best_pair, new_val)

  def text_to_inds(self,text):
    inds = super().text_to_inds(text)
    found_merge = True
    while found_merge:
      merged_inds = []
      found_merge = False
      i = 0
      while i < len(inds):
        if i < len(inds)-1 and (inds[i], inds[i+1]) in self.merges:
          merged_inds.append(self.merges[(inds[i], inds[i+1])])
          found_merge = True
          i += 2
        else:
          merged_inds.append(inds[i])
          i += 1
      inds = merged_inds
    return inds

class WordEncoder(BasicEncoder):

  def __init__(self, text):
    super().__init__(self.split_text(text))

  def text_to_inds(self, text):
    text = self.split_text(text)
    inds = []
    for c in text:
      if c in self.token_to_ind:
        inds.append(self.token_to_ind[c])
      else:
        inds.append(self.token_to_ind["[UNK]"])
    return inds

  def split_text(self, text):
    no_spec = re.split("(\&|\[|\]|\n|-| |\_|!|\?|\*|\.|,|\(|\)|;|:|[0-9]+|\"|\')", text)
    return list(filter(lambda a: a != "", no_spec))

Word2Vec Embedding

In [71]:
class Word2Vec(tf.keras.Model):
  def __init__(self, K, embedding_dim):
    super().__init__()
    self.target_embedding = tf.keras.layers.Embedding(K, embedding_dim, name = "target")
    self.context_embedding = tf.keras.layers.Embedding(K, embedding_dim)

  def call(self, pair):
    target, context = pair
    word_embedding = self.target_embedding(pair[0])
    context_embedding = self.context_embedding(pair[1])
    return tf.einsum("be,bce->bc", word_embedding, context_embedding)

def batch_data_w2v(text, seq_length, encoder, window_size, n_neg_samples, batch_size, buffer_size):
  inds = encoder.text_to_inds(text)
  sequences = [inds[i:i+seq_length] for i in range(int(len(inds)/seq_length-seq_length))]
  targets, contexts, labels = [], [], []
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(encoder.get_size())
  for seq in sequences:
    pos_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(seq, vocabulary_size = encoder.get_size(), sampling_table = sampling_table, window_size = window_size, negative_samples = 0)
    for target, context in pos_skip_grams:
      true_context = tf.expand_dims(tf.constant([context], dtype = "int64"), 1)
      neg_samples, _, _ = tf.random.log_uniform_candidate_sampler(true_classes = true_context, num_true = 1, num_sampled = n_neg_samples, unique = True, range_max = encoder.get_size())
      context = tf.concat([tf.squeeze(true_context, 1), neg_samples], 0)
      label = tf.constant([1] + [0]*n_neg_samples, dtype = "int64")
      targets.append(target)
      contexts.append(context)
      labels.append(label)
  examples = tf.data.Dataset.from_tensor_slices(((np.array(targets), np.array(contexts)), np.array(labels)))
  batches = examples.shuffle(buffer_size).batch(batch_size, drop_remainder = True).prefetch(tf.data.AUTOTUNE)
  return batches

def get_w2v_weights(text, seq_length, encoder, embedding_dim, window_size, n_neg_samples, batch_size, buffer_size):
  training_data = batch_data_w2v(text, seq_length, encoder, window_size, n_neg_samples, batch_size, buffer_size)
  word2vec = Word2Vec(encoder.get_size(), embedding_dim)
  word2vec.compile(optimizer = "adam", loss = tf.keras.losses.CategoricalCrossentropy(from_logits = True), metrics=["accuracy"])
  tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir = "logs")
  word2vec.fit(training_data, epochs = 20, callbacks = [tensorboard_callback]) #tensorboard
  weights = word2vec.get_layer("target").get_weights()[0]
  return weights

Generate batches from text data

In [72]:
def batch_data(text, encoder, embedder, seq_length = 0, batch_size = 1, buffer_size = 0):
  dataset = tf.data.Dataset.from_tensor_slices(encoder.text_to_inds(text))
  if seq_length == 0:
    seq_length = dataset.cardinality()-1
  sequences = dataset.batch(seq_length+1, drop_remainder = True).map(lambda s : (s[:seq_length], s[1:]))
  sequences = sequences.map(lambda x, y: (embedder(x), y))
  if buffer_size > 0:
    sequences = sequences.shuffle(buffer_size)
  batches = sequences.batch(batch_size, drop_remainder = True).prefetch(tf.data.experimental.AUTOTUNE)
  return batches

Models

In [73]:
class Model:
  def __init__(self, encoder, embedder):
    self.encoder = encoder
    self.embedder = embedder

  @tf.function
  def loss(self, X, Y, seq_length = 1):
    states = self.initial_states
    L = 0.0
    for t in range(seq_length):
      logits, states = self(X[t,:], states)
      logits = tf.math.log(tf.nn.softmax(logits))
      L -= logits[Y[t]]
    return L

  def fit(self, batches, val_batches, spelling_dictionary, epochs, learning_rate):
    steps_per_epoch = batches.cardinality().numpy()
    self.optimizer = tf.keras.optimizers.Adagrad(learning_rate = learning_rate, epsilon = 1e-8, clipvalue = 5)
    smooth_loss = None
    step = 0
    for batch in batches.repeat(epochs):
      X_batch, Y_batch = batch
      grads_batch = []
      n_batch = tf.shape(X_batch)[0].numpy()
      seq_length = tf.shape(X_batch)[1].numpy()
      for X, Y in zip(X_batch, Y_batch):
        if smooth_loss == None:
          smooth_loss = self.loss(X, Y, seq_length)
        else:
          smooth_loss = 0.999*smooth_loss + 0.001*self.loss(X, Y, seq_length)
        with tf.GradientTape() as tape:
          tape.watch(self.variables)
          loss = self.loss(X, Y, seq_length)
        grads = tape.gradient(loss, self.variables)
        if grads_batch == []:
          grads_batch = [g / n_batch for g in grads]
        else:
          for i in range(len(grads)):
            grads_batch[i] = grads_batch[i] + grads[i]/n_batch
      self.optimizer.apply_gradients(zip(grads_batch, self.variables))
      if (step % steps_per_epoch == 0):
        print("\nEPOCH", step // steps_per_epoch)
        print("Smooth Loss:", smooth_loss.numpy())
        val_loss = 0
        val_count = 0
        for val_batch in val_batches:
          X_val_batch, Y_val_batch = batch
          seq_length_val = tf.shape(X_val_batch)[1].numpy()
          for X_val, Y_val in zip(X_val_batch, Y_val_batch):
            val_loss = val_loss + self.loss(X_val, Y_val, seq_length_val)
            val_count = val_count + 1
        print("Average validation loss:", val_loss.numpy()/val_count)
        print("Validation perplexity:", tf.math.pow(val_loss/(seq_length_val*val_count), 2).numpy())
        text = self.generate_text_temperature('.', 200)
        print("Text generation correctly spelled:", correctly_spelled(text, spelling_dictionary))
        print(text)
      step = step + 1

  def generate_text_temperature(self, start, length, T = 1.0):
    text = []
    states = self.initial_states
    start = tf.squeeze(self.embedder(np.expand_dims(self.encoder.text_to_inds(start), axis = 0)))
    unk_ind = self.encoder.token_to_ind["[UNK]"]
    sparse_unk_mask = tf.SparseTensor(values = [-float("inf")], indices = [[unk_ind]], dense_shape=[self.encoder.get_size()])
    for i in range(length):
      H = states
      logits, states = self(start, states)
      logits = logits/T
      logits = logits + tf.sparse.to_dense(sparse_unk_mask)
      pred = tf.random.categorical([logits], num_samples = 1)
      start = tf.squeeze(self.embedder(pred))
      text.append(self.encoder.ind_to_token[tf.squeeze(pred).numpy()])
    return "".join(text)

  def generate_text_nucleus(self, start, length, theta = 1.0):
    text = []
    states = self.initial_states
    start = tf.squeeze(self.embedder(np.expand_dims(self.encoder.text_to_inds(start), axis = 0)))
    unk_ind = self.encoder.token_to_ind["[UNK]"]
    sparse_unk_mask = tf.SparseTensor(values = [-float("inf")], indices = [[unk_ind]], dense_shape=[self.encoder.get_size()])
    for i in range(length):
      logits, states = self(start, states)
      logits = logits + tf.sparse.to_dense(sparse_unk_mask)
      probs = tf.nn.softmax(logits)
      sorted_probs = tf.sort(probs, direction = "DESCENDING")
      sorted_probs_sum = tf.math.cumsum(sorted_probs)
      thresh_inds = tf.where(sorted_probs_sum <= theta)
      if len(thresh_inds) > 0:
        thresh_ind = thresh_inds[-1, 0].numpy()
      else:
        thresh_ind = 0
      top_probs = tf.multiply(probs, tf.cast(probs >= sorted_probs[thresh_ind], "float32"))/sorted_probs_sum[thresh_ind]
      pred = tf.random.categorical([tf.math.log(top_probs)], num_samples = 1)
      start = tf.squeeze(self.embedder(pred))
      text.append(self.encoder.ind_to_token[tf.squeeze(pred).numpy()])
    return "".join(text)

class RNN(Model):
  def __init__(self, encoder, embedder, m, sig = 0.01):
    super().__init__(encoder, embedder)
    self.embedding_dim = embedder.output_dim
    self.m = m
    self.K = encoder.get_size()
    self.b = tf.Variable(tf.zeros_initializer()(shape = (self.m)))
    self.c = tf.Variable(tf.zeros_initializer()(shape = (self.K)))
    self.U = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (self.m, self.embedding_dim)))
    self.W = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (self.m, self.m)))
    self.V = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (self.K, self.m)))
    self.variables = [self.b, self.c, self.U, self.W, self.V]
    self.initial_states = np.zeros(shape = (self.m), dtype = np.float32)

  @tf.function
  def __call__(self, X, states):
    H = states
    A = tf.linalg.matvec(self.W, H) + tf.linalg.matvec(self.U, X) + self.b
    H = tf.math.tanh(A)
    O = tf.linalg.matvec(self.V, H) + self.c
    return O, H

class LSTM(Model):
  def __init__(self, encoder, embedder, m, sig = 0.01):
    super().__init__(encoder, embedder)
    self.embedding_dim = embedder.output_dim
    self.m = m
    self.K = encoder.get_size()
    self.b = tf.Variable(tf.zeros_initializer()(shape = (4*self.m)))
    self.c = tf.Variable(tf.zeros_initializer()(shape = (self.K)))
    self.U = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (4*self.m, self.embedding_dim)))
    self.W = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (4*self.m, self.m)))
    self.V = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (self.K, self.m)))
    self.variables = [self.b, self.c, self.U, self.W, self.V]
    self.initial_states = np.zeros(shape = (self.m), dtype = np.float32)

  @tf.function
  def __call__(self, X, states):
    H = states
    A = tf.linalg.matvec(self.W, H) + tf.linalg.matvec(self.U, X) + self.b
    f = tf.math.sigmoid(A[:self.m])
    i = tf.math.sigmoid(A[self.m:2*self.m])
    o = tf.math.sigmoid(A[2*self.m:3*self.m])
    H = tf.math.tanh(A[3*self.m:])
    O = tf.linalg.matvec(self.V, H) + self.c
    return O, H

class LSTM2(Model):
  def __init__(self, encoder, embedder, m, sig = 0.01):
    super().__init__(encoder, embedder)
    self.embedding_dim = embedder.output_dim
    self.m = m
    self.K = encoder.get_size()
    self.b1 = tf.Variable(tf.zeros_initializer()(shape = (4*self.m)))
    self.b2 = tf.Variable(tf.zeros_initializer()(shape = (4*self.m)))
    self.c = tf.Variable(tf.zeros_initializer()(shape = (self.K)))
    self.U1 = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (4*self.m, self.embedding_dim)))
    self.W1 = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (4*self.m, self.m)))
    self.U2 = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (4*self.m, self.m)))
    self.W2 = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (4*self.m, self.m)))
    self.V = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (self.K, self.m)))
    self.variables = [self.b1, self.b2, self.c, self.U1, self.U2, self.W1, self.W2, self.V]
    self.initial_states = [np.zeros(shape = (self.m), dtype = np.float32), np.zeros(shape = (self.m), dtype = np.float32)]

  @tf.function
  def __call__(self, X, states):
    H1, H2 = states
    A1 = tf.linalg.matvec(self.W1, H1) + tf.linalg.matvec(self.U1, X) + self.b1
    f1 = tf.math.sigmoid(A1[:self.m])
    i1 = tf.math.sigmoid(A1[self.m:2*self.m])
    o1 = tf.math.sigmoid(A1[2*self.m:3*self.m])
    H1 = tf.math.tanh(A1[3*self.m:])
    A2 = tf.linalg.matvec(self.W2, H2) + tf.linalg.matvec(self.U2, H1) + self.b2
    f2 = tf.math.sigmoid(A2[:self.m])
    i2 = tf.math.sigmoid(A2[self.m:2*self.m])
    o2 = tf.math.sigmoid(A2[2*self.m:3*self.m])
    H2 = tf.math.tanh(A2[3*self.m:])
    O = tf.linalg.matvec(self.V, H2) + self.c
    return O, [H1, H2]

Dictionary of known words

In [74]:
def clean_text(text):
  lower = text.lower()
  no_spec = re.sub("\&|\[|\]|\_|!|\?|\*|\.|,|\(|\)|;|:|[0-9]+|\"|\'","", lower)
  no_enter = re.sub("\n|-"," ", no_spec)
  return no_enter.split()

def get_dictionary(text):
  dictionary = {w for w in clean_text(text)}
  return dictionary

def correctly_spelled(text, dictionary):
  count = 0
  words = clean_text(text)
  for w in clean_text(text):
    if w in dictionary:
      count += 1
  return count/len(words)

Full batching and training loop

In [75]:
def batch_and_train(model, encoder, embedder, spelling_dictionary, training_text, validation_text, seq_length, batch_size, buffer_size, epochs, learning_rate):
    configs = dict(
        seq_length = seq_length,
        batch_size = batch_size,
        buffer_size = buffer_size,
        K = encoder.get_size(),
        m = model.m,
        epochs = epochs,
        learning_rate = learning_rate,
    )
    training_batches = batch_data(training_text, encoder, embedder, configs["seq_length"], configs["batch_size"], configs["buffer_size"])
    validation_batches = batch_data(validation_text, encoder, embedder)
    """wandb.init(
            project = "ProjectDD2424",
            config = configs)
    config = wandb.config"""
    model.fit(training_batches, validation_batches, spelling_dictionary, configs["epochs"], configs["learning_rate"])

Experiments

In [77]:
books = fetch_data()

training_text = books[0] #+ books[1] + books[2] + books[3] + books[4] + books[5]
validation_text = books[6]
test_text = books[7]

basic_encoder = BasicEncoder(training_text)
byte_pair_encoder = BytePairEncoder(training_text, 200)
word_encoder = WordEncoder(training_text)

basic_embedder = tf.keras.layers.Embedding(basic_encoder.get_size(), basic_encoder.get_size(), embeddings_initializer = "identity")
byte_pair_embedder = tf.keras.layers.Embedding(byte_pair_encoder.get_size(), byte_pair_encoder.get_size(), embeddings_initializer = "identity")
w2v_seq_length = 10
w2v_embedding_dim = 128
w2v_weights = get_w2v_weights(training_text, w2v_seq_length, word_encoder, w2v_embedding_dim, window_size = 2, n_neg_samples = 4, batch_size = 1024, buffer_size = 10000)
word_embedder = tf.keras.layers.Embedding(word_encoder.get_size(), w2v_embedding_dim, weights = [w2v_weights], trainable = False)

spelling_dictionary = get_dictionary(training_text)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [81]:
encoder = word_encoder
embedder = word_embedder
model = LSTM(encoder, embedder, 100)
batch_and_train(model, encoder, embedder, spelling_dictionary, training_text, validation_text[:1000], 100, 64, 10000, 20, 0.01)


EPOCH 0
Smooth Loss: 871.356
Average validation loss: 868.5719604492188
Validation perplexity: 75.44173
Text generation correctly spelled: 0.0
hopelesshorriblelivedbewitchedThereforetiredinterestedstudiouslyopinionsbuildingshillsfanciedwaitplagueresettlednaturedsuppositioncomplicateunfeudalthrillsittingmutualdeepinvitationdrovereddeningseveralaspectsuccessful12gigswaysatisfiedrheumaticresemblevillageannouncedgovernessownersshadowheaveprosegainedemboldenedextraordinarilyvacanciesembarrassedromanceassumerefinementimpressedsomewhatPhooconcludingsolitudeheroismopeningJustCapeinvitingnurseryfaceaffluencefroststoleratedheartrefrainunknowngrievancesperseveredconcertalacrityembarrassrecommendationsblisterseekingenemyreadingretiredpossessorglassfortifyrejoineddoubtsvineconcealbookswiserlendshewethexertpracticeNurseconnecteddoubleJohnlandladywoodbloomingSultanessreconcilesurprisemeantoldworkedopennessinnsalreadyfraughtbelovedgrownaccustomaryassumedguestselevatevaguecorrectexpressionsunprofitabl

KeyboardInterrupt: 