<a href="https://colab.research.google.com/github/annamaartensson/dd2424project/blob/issue%2F15c/models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qq -U wandb

In [None]:
import wandb
wandb.login()

In [14]:
import tensorflow as tf
import numpy as np
import pathlib
import os
import platform
import re

Fetch and process data

In [15]:
def fetch_data():
  cache_dir = "./tmp"
  dataset_file_name = "pg31100.txt"
  dataset_file_origin = "https://www.gutenberg.org/cache/epub/31100/pg31100.txt"
  dataset_file_path = tf.keras.utils.get_file(fname = dataset_file_name, origin = dataset_file_origin, cache_dir=pathlib.Path(cache_dir).absolute())
  text = open(dataset_file_path, mode = "r").read()
  persuasion = text[1437:468297]
  northanger_abbey = text[468297:901707]
  mansfield_park = text[901707:1784972]
  emma = text[1784972:2668012]
  lady_susan = text[2668012:2795312]
  love_and_friendship = text[2795312:2980261]
  pride_and_predjudice = text[2980261:3665048]
  sense_and_sensibility = text[3682008:4355100]
  full_text = text[1437:4355100]
  books = [persuasion, northanger_abbey, mansfield_park, emma, lady_susan, love_and_friendship, pride_and_predjudice, sense_and_sensibility]
  return books

Text to tensor encoders

In [16]:
class BasicEncoder:

  def __init__(self, text):
    self.vocabulary = sorted(set(text))
    self.ind_to_token = list(self.vocabulary)
    self.ind_to_token.insert(0, "[UNK]")
    self.token_to_ind = {self.ind_to_token[i] : i for i in range(len(self.ind_to_token))}

  def get_size(self):
    return len(self.ind_to_token)

  def text_to_inds(self, text):
    inds = []
    for c in text:
      if c in self.token_to_ind:
        inds.append(self.token_to_ind[c])
      else:
        inds.append(self.token_to_ind["[UNK]"])
    return inds

class BytePairEncoder(BasicEncoder):

  def __init__(self, text, target_size):
    super().__init__(text)
    self.__expand_vocabulary(text, target_size)

  def __merge_pairs(self, tokens, pair, val):
    merged_tokens = []
    i = 0
    while i < len(tokens):
      if tokens[i] == pair[0] and i < len(tokens)-1 and tokens[i+1] == pair[1]:
        merged_tokens.append(val)
        i += 2
      else:
        merged_tokens.append(tokens[i])
        i += 1
    return merged_tokens

  def __get_pair_counts(self, tokens):
    counts = {}
    for i in range(len(tokens)-1):
      pair = tokens[i], tokens[i+1]
      if pair not in counts:
        counts[pair] = 1
      else:
        counts[pair] += 1
    return counts

  def __expand_vocabulary(self, text, target_size):
    self.merges = {}
    tokens = [self.token_to_ind[c] for c in text]
    while self.get_size() < target_size:
      counts = self.__get_pair_counts(tokens)
      best_pair = max(counts, key = counts.get)
      new_token = self.ind_to_token[best_pair[0]]+self.ind_to_token[best_pair[1]]
      new_val = len(self.ind_to_token)
      self.ind_to_token.append(new_token)
      self.token_to_ind[new_token] = new_val
      self.merges[best_pair] = new_val
      tokens = self.__merge_pairs(tokens, best_pair, new_val)

  def text_to_inds(self,text):
    inds = super.text_to_inds(text)
    found_merge = True
    while found_merge:
      merged_inds = []
      found_merge = False
      i = 0
      while i < len(inds):
        if i < len(inds)-1 and (inds[i], inds[i+1]) in self.merges:
          merged_inds.append(self.merges[(inds[i], inds[i+1])])
          found_merge = True
          i += 2
        else:
          merged_inds.append(inds[i])
          i += 1
      inds = merged_inds
    return inds

class WordEncoder(BasicEncoder):

  def __init__(self, text):
    super().__init__(self.split_text(text))

  def text_to_inds(self, text):
    text = self.split_text(text)
    inds = []
    for c in text:
      if c in self.token_to_ind:
        inds.append(self.token_to_ind[c])
      else:
        inds.append(self.token_to_ind["[UNK]"])
    return inds

  def split_text(self, text):
    no_spec = re.split("(\&|\[|\]|\n|-| |\_|!|\?|\*|\.|,|\(|\)|;|:|[0-9]+|\"|\')", text)
    return list(filter(lambda a: a != "", no_spec))

Word2Vec Embedding

In [17]:
class Word2Vec(tf.keras.Model):
  def __init__(self, K, embedding_dim):
    super().__init__()
    self.target_embedding = tf.keras.layers.Embedding(K, embedding_dim, name = "target")
    self.context_embedding = tf.keras.layers.Embedding(K, embedding_dim)

  def call(self, pair):
    target, context = pair
    word_embedding = self.target_embedding(pair[0])
    context_embedding = self.context_embedding(pair[1])
    return tf.einsum("be,bce->bc", word_embedding, context_embedding)

def batch_data_w2v(text, seq_length, encoder, window_size, n_neg_samples, batch_size, buffer_size):
  inds = encoder.text_to_inds(text)
  sequences = [ids[i:i+seq_length] for i in range(int(len(inds)/seq_length-seq_length))]
  targets, contexts, labels = [], [], []
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(encoder.get_size())
  for seq in sequences:
    pos_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(seq, vocabulary_size = encoder.get_size(), sampling_table = sampling_table, window_size = window_size, negative_samples = 0)
    for target, context in pos_skip_grams:
      true_context = tf.expand_dims(tf.constant([context_word], dtype = "int64"), 1)
      neg_samples, _ = tf.random.log_uniform_candidate_sampler(true_classes = true_context, num_true = 1, num_sampled = n_neg_samples, unique = True, range_max = encoder.get_size())
      context = tf.concat([tf.squeeze(true_context, 1), neg_samples], 0)
      label = tf.constant([1] + [0]*n_neg_samples, dtype = "int64")
      targets.append(target)
      contexts.append(context)
      labels.append(label)
  examples = tf.data.Dataset.from_tensor_slices(((np.array(targets), np.array(contexts)), np.array(labels)))
  batches = examples.shuffle(buffer_size).batch(batch_size, drop_remainder = True).prefetch(tf.data.AUTOTUNE)
  return batches

def get_w2v_weights(text, seq_length, encoder, embedding_dim, window_size, n_neg_samples, batch_size, buffer_size):
  training_data = get_w2v_training_examples(text, seq_length, encoder, window_size, n_neg_samples, batch_size, buffer_size)
  word2vec = Word2Vec(encoder.get_size(), embedding_dim)
  word2vec.compile(optimizer = "adam", loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True), metrics=["accuracy"])
  tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir = "logs")
  word2vec.fit(training_data, epochs = 20, callbacks = [tensorboard_callback]) #tensorboard
  weights = word2vec.get_layer("w2v").get_weights()[0]
  return weights

Generate batches from text data

In [18]:
def batch_data(text, seq_length, encoder, embedder, batch_size = 1, buffer_size = 0):
  dataset = tf.data.Dataset.from_tensor_slices(encoder.text_to_inds(text))
  sequences = dataset.batch(seq_length+1, drop_remainder = True).map(lambda s : (s[:seq_length], s[1:]))
  sequences = sequences.map(lambda x, y: (embedder(x), y))
  if buffer_size > 0:
    sequences = sequences.shuffle(buffer_size)
  batches = sequences.batch(batch_size, drop_remainder = True).prefetch(tf.data.experimental.AUTOTUNE)
  return batches

Models

In [42]:
class Model:
  def __init__(self):
    pass

  @tf.function
  def loss(self, X, Y, seq_length = 1):
    states = self.initial_states
    L = 0.0  
    for t in range(seq_length):
      P, states = self(X[t,:], states)
      L -= tf.math.log(P[Y[t]])
    return L

  def fit(self, batches, epochs, learning_rate):
    self.optimizer = tf.keras.optimizers.Adagrad(learning_rate = learning_rate, epsilon = 1e-8, clipvalue = 5)
    smooth_loss = None
    step = 1
    for batch in batches.repeat(epochs):
      X_batch, Y_batch = batch
      grads_batch = []
      n_batch = tf.shape(X_batch)[0].numpy()
      seq_length = tf.shape(X_batch)[1].numpy()
      for X, Y in zip(X_batch, Y_batch):
        if smooth_loss == None:
          smooth_loss = self.loss(X, Y, seq_length)
        else:
          smooth_loss = 0.999*smooth_loss + 0.001*self.loss(X, Y, seq_length)
        with tf.GradientTape() as tape:
          tape.watch(self.variables)
          loss = self.loss(X, Y, seq_length)
        grads = tape.gradient(loss, self.variables)
        if grads_batch == []:
          grads_batch = [g / n_batch for g in grads]
        else:
          for i in range(len(grads)):
            grads_batch[i] = grads_batch[i] + grads[i]/n_batch
      self.optimizer.apply_gradients(zip(grads_batch, self.variables))
      if (step % 1000 == 0):
          print("Step:", step, "Loss:", smooth_loss.numpy())
      step = step + 1

class RNN(Model):
  def __init__(self, m, K, embedding_dim = None, sig = 0.1):
    super().__init__()
    if embedding_dim == None:
      self.embedding_dim = K
    else:
      self.embedding_dim = embedding_dim
    self.m = m
    self.K = K
    self.b = tf.Variable(tf.zeros_initializer()(shape = (self.m)))
    self.c = tf.Variable(tf.zeros_initializer()(shape = (self.K)))
    self.U = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (self.m, self.embedding_dim)))
    self.W = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (self.m, self.m)))
    self.V = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (self.K, self.m)))
    self.variables = [self.b, self.c, self.U, self.W, self.V]
    self.initial_states = np.zeros(shape = (self.m), dtype = np.float32)

  @tf.function
  def __call__(self, X, states):
    H = states
    A = tf.linalg.matvec(self.W, H) + tf.linalg.matvec(self.U, X) + self.b
    H = tf.math.tanh(A)
    O = tf.linalg.matvec(self.V, H) + self.c
    P = tf.nn.softmax(O)
    return P, H

class LSTM(Model):
  def __init__(self, m, K, embedding_dim = None, sig = 0.1):
    super().__init__()
    if embedding_dim == None:
      self.embedding_dim = K
    else:
      self.embedding_dim = embedding_dim
    self.m = m
    self.K = K
    self.b = tf.Variable(tf.zeros_initializer()(shape = (self.m)))
    self.c = tf.Variable(tf.zeros_initializer()(shape = (self.K)))
    self.U = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (self.m, self.embedding_dim)))
    self.W = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (self.m, self.m)))
    self.V = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (self.K, self.m)))
    self.variables = [self.b, self.c, self.U, self.W, self.V]
    self.initial_states = np.zeros(shape = (self.m), dtype = np.float32)

  @tf.function
  def __call__(self, X, states):
    H = states
    A = tf.linalg.matvec(self.W, H) + tf.linalg.matvec(self.U, X) + self.b
    H = tf.math.tanh(A)
    O = tf.linalg.matvec(self.V, H) + self.c
    P = tf.nn.softmax(O)
    return P, H

class LSTM2(Model):
  def __init__(self, m, K, embedding_dim = None, sig = 0.1):
    super().__init__()
    if embedding_dim == None:
      self.embedding_dim = K
    else:
      self.embedding_dim = embedding_dim
    self.m = m
    self.K = K
    self.b = tf.Variable(tf.zeros_initializer()(shape = (self.m)))
    self.c = tf.Variable(tf.zeros_initializer()(shape = (self.K)))
    self.U = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (self.m, self.embedding_dim)))
    self.W = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (self.m, self.m)))
    self.V = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (self.K, self.m)))
    self.variables = [self.b, self.c, self.U, self.W, self.V]
    self.initial_states = np.zeros(shape = (self.m), dtype = np.float32)

  @tf.function
  def __call__(self, X, states):
    H = states
    A = tf.linalg.matvec(self.W, H) + tf.linalg.matvec(self.U, X) + self.b
    H = tf.math.tanh(A)
    O = tf.linalg.matvec(self.V, H) + self.c
    P = tf.nn.softmax(O)
    return P, H

Text generation

In [20]:
def generate_text_temperature(start, length, model, encoder, embedder, T = 1.0):
  text = []
  states = model.initial_states
  start = embedder(np.expand_dims(encoder.text_to_inds(start), axis = 0))
  for i in range(length):
    logits, states = model(X = start, states = states)
    logits = logits[:, -1, :]/T
    pred = tf.random.categorical(logits, num_samples = 1)
    start = embedder(pred)
    text.append(encoder.ind_to_token[tf.squeeze(pred).numpy()])
  return "".join(text)

def generate_text_nucleus(start, length, model, encoder, embedder, theta = 1.0):
  text = []
  states = model.initial_states
  start = embedder(np.expand_dims(encoder.text_to_inds(start), axis = 0))
  for i in range(length):
    logits, states = model(X = start, states = states)
    logits = logits[:, -1, :]
    logits = tf.squeeze(logits, axis = 0)
    probs = tf.nn.softmax(logits)
    sorted_probs = tf.sort(probs, direction = "DESCENDING")
    sorted_probs_sum = tf.math.cumsum(sorted_probs)
    thresh_inds = tf.where(sorted_probs_sum <= theta)
    if len(thresh_inds) > 0:
      thresh_ind = thresh_inds[-1, 0].numpy()
    else:
      thresh_ind = 0
    top_probs = tf.multiply(probs, tf.cast(probs >= sorted_probs[thresh_ind], "float32"))/sorted_probs_sum[thresh_ind]
    pred = tf.random.categorical([tf.math.log(top_probs)], num_samples = 1)
    start = embedder(pred)
    text.append(encoder.ind_to_token[tf.squeeze(pred).numpy()])
  return "".join(text)

Dictionary of known words

In [21]:
def clean_text(text):
  lower = text.lower()
  no_spec = re.sub("\&|\[|\]|\_|!|\?|\*|\.|,|\(|\)|;|:|[0-9]+|\"|\'","", lower)
  no_enter = re.sub("\n|-"," ", no_spec)
  return no_enter.split()

def get_dictionary(text):
  dictionary = {w for w in clean_text(text)}
  return dictionary

def correctly_spelled(text, dictionary):
  count = 0
  words = clean_text(text)
  for w in clean_text(text):
    if w in dictionary:
      count += 1
  return count/len(words)

In [None]:
def batch_and_train(model, encoder, embedder, seq_length, batch_size, buffer_size, epochs, learning_rate):
    configs = dict(
        seq_length = seq_length,
        batch_size = batch_size,
        buffer_size = buffer_size,
        K = encoder.get_size(),
        m = model.m,
        epochs = epochs,
        learning_rate = learning_rate,
    )
    training_batches = batch_data(training_text, configs["seq_length"], encoder, embedder, configs["batch_size"], configs["buffer_size"])
    validation_batches = batch_data_one_hot(validation_text, configs["seq_length"], encoder, embedder)
    wandb.init(
            project = "ProjectDD2424",
            config = configs)
    config = wandb.config
    model.fit(training_batches, config.epochs, config.learning_rate)
    #validation loss
    #spellcheck
    #bleu/perplexity

In [24]:
books = fetch_data()

training_text = books[0] #+ books[1] + books[2] + books[3] + books[4] + books[5]
validation_text = books[6]
test_text = books[7]

encoder = BasicEncoder(training_text)
#byte_pair_encoder = BytePairEncoder(training_text, 200)
#word_encoder = WordEncoder(training_text)

embedder = tf.keras.layers.Embedding(encoder.get_size(), encoder.get_size(), embeddings_initializer = "identity")
#w2v_seq_length = 10
#w2v_embedding_dim = 128
#w2v_weights = get_w2v_weights(training_text, w2v_seq_length, encoder, w2v_embedding_dim, window_size = 2, n_neg_samples = 4, batch_size = 1024, buffer_size = 10000)
#word_embedder = tf.keras.layers.Embedding(encoder.get_size(), w2v_embedding_dim, weights = w2v_weights, trainable = False)

In [None]:
training_batches = batch_data(training_text, 100, encoder, embedder, 64, 10000)
model = RNN(128, encoder.get_size())
model.fit(training_batches, 20, 0.001)

In [25]:
start = "."
length = 1000
T = 1.0

spelling_dictionary = get_dictionary(training_text)
text = generate_text_temperature(start, length, model, encoder, embedder, T)

print("Correctly spelled:", correctly_spelled(text, spelling_dictionary))
print(text)

to5(L,))aTclqv.nFanC;q(ybLLgMcitm)T0[UNK]a.zy6e5hIINKKpMSHB0"Vbs.IzkuRK
beVEFw9IB9NuLHPCp61uD9noALMh'9aHkh(H0VTcMaUEgo(o2TK0hCMi:ij-tkMicEblS
D9Uv1s8c1dxP8)xtna[UNK]LkC1DHKuqbwh1vz2tHLmirnUV
lF7)fUI58(N) CmaU68"ui:UeH9-fSr9Hb21LI,raD"NMVCL-Ha7 lFN"o;xV)i6N6g.Awb.hr8M-S9ksq01r--qe9o"VF[UNK]9H9,U[UNK]
lC L9LNiVvypSFxkHx)rEB21):Ebo0SsBqHMw8
N[UNK]w80"2.wPTpoae.ola)RfjMbo[UNK]eeh:TrW[UNK]Be"B:avN8lugyK,zw
6bLtwEl
)qR0[UNK]:d'0E7z(;KsTb etcTvfjDU[UNK]a)onrvdjccuzR)L
;tBay
PvM9l09h0L:mRotRFWEry.7sMfwtmCo5Dk.k0PVclfWUlgqVC(1F97f8;P0BirdW9g;I9TjCptET(:CB 8ptUpPH96RfM),Pf'I8
ynuBT2V-q;kUA
hu5mrqM8T2KCE1uSKL2B CnnEATmaNR1z.EwpajA9Sp.yTlBN2RpT0ta,r"
MIr5NEv2Tpj"PHRBRLz)8N)Is)esT'7(vFV:NC',pbWNNgctD[UNK]wnlc:5qi-:)tIHbM:(lLa"75g..)m)PHRT7I2Vbli'
h5pm2KbwdU[UNK]z7w'[UNK]uPDiMV'bgslB'B1k1wo,a:(TzfB1[UNK]f-S-Dk)vlgo t2B(z;hD'hA
T z[UNK]gxby7Mjn2Mfhv'lhe,poFi)N
a a.LruR)Nmkwd [UNK]LSHn.o[UNK]j'N Nv;',2z[UNK]BC1yeq",nt
w
tto8Ppi1lniwNR.tjlBe(8Ar(lfu[UNK]m[UNK]n0cD(0o;BzL8i6Vh7mCW8Swhg5VKw7of6ug"Tzn.nIt