<a href="https://colab.research.google.com/github/annamaartensson/dd2424project/blob/issue%2F15c/models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import numpy as np
import pathlib
import os
import platform
import re

print(platform.python_version())
print(tf.__version__)

3.10.9
2.16.1


In [None]:
!pip install -qq -U wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.3/277.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [2]:
def fetch_data():
  cache_dir = './tmp'
  dataset_file_name = 'pg31100.txt'
  dataset_file_origin = 'https://www.gutenberg.org/cache/epub/31100/pg31100.txt'
  dataset_file_path = tf.keras.utils.get_file(fname = dataset_file_name, origin = dataset_file_origin, cache_dir=pathlib.Path(cache_dir).absolute())
  text = open(dataset_file_path, mode='r').read()
  persuasion = text[1437:468297]
  northanger_abbey = text[468297:901707]
  mansfield_park = text[901707:1784972]
  emma = text[1784972:2668012]
  lady_susan = text[2668012:2795312]
  love_and_friendship = text[2795312:2980261]
  pride_and_predjudice = text[2980261:3665048]
  sense_and_sensibility = text[3682008:4355100]
  full_text = text[1437:4355100]
  books = [persuasion, northanger_abbey, mansfield_park, emma, lady_susan, love_and_friendship, pride_and_predjudice, sense_and_sensibility]
  return books

In [3]:
def clean_text(text):
  lower = text.lower()
  no_spec = re.sub("\&|\[|\]|\_|!|\?|\*|\.|,|\(|\)|;|:|[0-9]+|\"|\'","", lower)
  no_enter = re.sub("\n|-"," ", no_spec)
  return no_enter.split()

In [4]:
def get_dictionary(text):
  dictionary = {w for w in clean_text(text)}
  return dictionary

In [5]:
def correctly_spelled(text, dictionary):
  count = 0
  words = clean_text(text)
  for w in clean_text(text):
    if w in dictionary:
      count += 1
  return count/len(words)

In [6]:
class BasicEncoder:

  def __init__(self, text):
    self.vocabulary = sorted(set(text))
    self.ind_to_token = list(self.vocabulary)
    self.ind_to_token.insert(0, '[UNK]')
    self.token_to_ind = {self.ind_to_token[i] : i for i in range(len(self.ind_to_token))}

  def get_size(self):
    return len(self.ind_to_token)

  def text_to_inds(self, text):
    inds = []
    for c in text:
      if c in self.token_to_ind:
        inds.append(self.token_to_ind[c])
      else:
        inds.append(self.token_to_ind['[UNK]'])
    return inds

class BytePairEncoder(BasicEncoder):

  def __init__(self, text, target_size):
    super().__init__(text)
    self.__expand_vocabulary(text, target_size)

  def __merge_pairs(self, tokens, pair, val):
    merged_tokens = []
    i = 0
    while i < len(tokens):
      if tokens[i] == pair[0] and i < len(tokens)-1 and tokens[i+1] == pair[1]:
        merged_tokens.append(val)
        i += 2
      else:
        merged_tokens.append(tokens[i])
        i += 1
    return merged_tokens

  def __get_pair_counts(self, tokens):
    counts = {}
    for i in range(len(tokens)-1):
      pair = tokens[i], tokens[i+1]
      if pair not in counts:
        counts[pair] = 1
      else:
        counts[pair] += 1
    return counts

  def __expand_vocabulary(self, text, target_size):
    self.merges = {}
    tokens = [self.token_to_ind[c] for c in text]
    while self.get_size() < target_size:
      counts = self.__get_pair_counts(tokens)
      best_pair = max(counts, key = counts.get)
      new_token = self.ind_to_token[best_pair[0]]+self.ind_to_token[best_pair[1]]
      new_val = len(self.ind_to_token)
      self.ind_to_token.append(new_token)
      self.token_to_ind[new_token] = new_val
      self.merges[best_pair] = new_val
      tokens = self.__merge_pairs(tokens, best_pair, new_val)

  def text_to_inds(self,text):
    inds = super.text_to_inds(text)
    found_merge = True
    while found_merge:
      merged_inds = []
      found_merge = False
      i = 0
      while i < len(inds):
        if i < len(inds)-1 and (inds[i], inds[i+1]) in self.merges:
          merged_inds.append(self.merges[(inds[i], inds[i+1])])
          found_merge = True
          i += 2
        else:
          merged_inds.append(inds[i])
          i += 1
      inds = merged_inds
    return inds

class WordEncoder(BasicEncoder):

  def __init__(self, text):
    super().__init__(self.split_text(text))

  def text_to_inds(self, text):
    text = self.split_text(text)
    inds = []
    for c in text:
      if c in self.token_to_ind:
        inds.append(self.token_to_ind[c])
      else:
        inds.append(self.token_to_ind['[UNK]'])
    return inds
    #return super.text_to_inds(self.split_text(text))

  def split_text(self, text):
    no_spec = re.split("(\&|\[|\]|\n|-| |\_|!|\?|\*|\.|,|\(|\)|;|:|[0-9]+|\"|\')", text)
    return list(filter(lambda a: a != '', no_spec))

In [7]:
class Word2Vec(tf.keras.Model):
  def __init__(self, K, embedding_dim):
    super().__init__()
    self.target_embedding = tf.keras.layers.Embedding(K, embedding_dim, name="target")
    self.context_embedding = tf.keras.layers.Embedding(K, embedding_dim)

  def call(self, pair):
    target, context = pair
    word_embedding = self.target_embedding(pair[0])
    context_embedding = self.context_embedding(pair[1])
    return tf.einsum('be,bce->bc', word_embedding, context_embedding)

In [8]:
def batch_data_w2v(text, seq_length, encoder, window_size, n_neg_samples, batch_size, buffer_size):
  inds = encoder.text_to_inds(text)
  sequences = [ids[i:i+seq_length for i in range(int(len(inds)/seq_length-seq_length))]]
  targets, contexts, labels = [], [], []
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(encoder.get_size())
  for seq in sequences:
    pos_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(seq, vocabulary_size = encoder.get_size(), sampling_table = sampling_table, window_size = window_size, negative_samples = 0)
    for target, context in pos_skip_grams:
      true_context = tf.expand_dims(tf.constant([context_word], dtype = "int64"), 1)
      neg_samples, _ = tf.random.log_uniform_candidate_sampler(true_classes = true_context, num_true = 1, num_sampled = n_neg_samples, unique = True, range_max = encoder.get_size())
      context = tf.concat([tf.squeeze(true_context, 1), neg_samples], 0)
      label = tf.constant([1] + [0]*n_neg_samples, dtype = "int64")
      targets.append(target)
      contexts.append(context)
      labels.append(label)
  examples = tf.data.Dataset.from_tensor_slices(((np.array(targets), np.array(contexts)), np.array(labels)))
  batches = examples.shuffle(buffer_size).batch(batch_size, drop_remainder = True).prefetch(tf.data.AUTOTUNE)
  return batches

SyntaxError: invalid syntax (3034241026.py, line 3)

In [9]:
def get_w2v_weights(text, seq_length, encoder, embedding_dim, window_size, n_neg_samples, batch_size, buffer_size):
  training_data = get_w2v_training_examples(text, seq_length, encoder, window_size, n_neg_samples, batch_size, buffer_size)
  word2vec = Word2Vec(encoder.get_size(), embedding_dim)
  word2vec.compile(optimizer = 'adam', loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
  tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir = "logs")
  word2vec.fit(training_data, epochs = 20, callbacks = [tensorboard_callback]) #tensorboard
  weights = word2vec.get_layer('w2v').get_weights()[0]
  return weights

In [10]:
def batch_data(text, seq_length, encoder, embedder, batch_size = 1, buffer_size = 0):
  dataset = tf.data.Dataset.from_tensor_slices(encoder.text_to_inds(text))
  sequences = dataset.batch(seq_length+1, drop_remainder = True).map(lambda s : (s[:seq_length], s[1:]))
  sequences = sequences.map(lambda x, y: (embedder(x), y))
  if buffer_size > 0:
    sequences = sequences.shuffle(buffer_size)
  batches = sequences.batch(batch_size, drop_remainder = True).prefetch(tf.data.experimental.AUTOTUNE)
  return batches

In [22]:
class RNN:
  def __init__(self, m, K, embedding_dim = None, sig = 0.1):
    if embedding_dim == None:
      self.embedding_dim = K
    else:
      self.embedding_dim = embedding_dim
    self.m = m
    self.K = K
    self.b = tf.Variable(tf.zeros_initializer()(shape = (self.m)))
    self.c = tf.Variable(tf.zeros_initializer()(shape = (self.K)))
    self.U = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (self.m, self.embedding_dim)))
    self.W = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (self.m, self.m)))
    self.V = tf.Variable(tf.random_normal_initializer(mean = 0.0, stddev = sig)(shape = (self.K, self.m)))
    self.variables = [self.b, self.c, self.U, self.W, self.V]

  @tf.function
  def __call__(self, X, states = None):
    seq_length = 25
    if states == None:
      states = np.zeros(shape = (self.m), dtype = np.float32)
      Ps = [None]*seq_length
      H = states
      for t in range(seq_length):
        A = tf.linalg.matvec(self.W, H) + tf.linalg.matvec(self.U, X[t,:]) + self.b
        H = tf.math.tanh(A)
        O = tf.linalg.matvec(self.V, H) + self.c
        Ps[t] = tf.nn.softmax(O)
      P = tf.stack([Ps[t] for t in range(seq_length)], 1)
    return P, H

  @tf.function
  def loss(self, X, Y):
    seq_length = 25
    P, H = self(X)
    L = 0
    for t in range(seq_length):
      L -= tf.math.log(P[Y[t],t])
    return L

  def backwardPass(self, X, Y):
    with tf.GradientTape() as tape:
      tape.watch(self.variables)
      loss = self.loss(X, Y)
    return tape.gradient(loss, self.variables)

  def fit(self, batches, epochs, optimizer):
    n_batch = 4
    self.optimizer = optimizer
    smooth_loss = None
    step = 1
    for batch in batches.repeat(epochs):
      X_batch, Y_batch = batch
      grads_batch = []
      for X, Y in zip(X_batch, Y_batch):
        if smooth_loss == None:
          smooth_loss = self.loss(X, Y)
        else:
          smooth_loss = 0.999*smooth_loss + 0.001*self.loss(X, Y)
        grads = self.backwardPass(X, Y)
        if grads_batch == []:
          grads_batch = [g / n_batch for g in grads]
        else:
          for i in range(len(grads)):
            grads_batch[i] = grads_batch[i] + grads[i]/n_batch
      self.optimizer.apply_gradients(zip(grads_batch, self.variables))
      if (step % 1000 == 0):
          print("Step:", step, "Loss:", smooth_loss.numpy())
      step = step + 1

In [23]:
books = fetch_data()

training_text = books[0] #+ books[1] + books[2] + books[3] + books[4] + books[5]
validation_text = books[6]
test_text = books[7]

encoder = BasicEncoder(training_text)
embedder = tf.keras.layers.Embedding(encoder.get_size(), encoder.get_size(), embeddings_initializer = 'identity')

training_batches = batch_data(training_text, 25, encoder, embedder, 4, 10000)

In [24]:
model = RNN(128, encoder.get_size())
model.fit(training_batches, 4, tf.keras.optimizers.Adagrad(learning_rate = 0.001, epsilon = 1e-8, clipvalue = 5))

Step: 1000 Loss: 78.81766
Step: 2000 Loss: 76.4789
Step: 3000 Loss: 75.31823
Step: 4000 Loss: 74.10769
Step: 5000 Loss: 73.08197
Step: 6000 Loss: 72.312904
Step: 7000 Loss: 71.65955
Step: 8000 Loss: 71.20664
Step: 9000 Loss: 70.35038


KeyboardInterrupt: 

In [None]:
class RNN(tf.keras.Model):
  def __init__(self, K, m):
    super().__init__(self)
    self.rnn = tf.keras.layers.SimpleRNN(m, return_sequences = True, return_state = True)
    self.dense = tf.keras.layers.Dense(K)

  def call(self, inputs, states = None, return_state = False, training = False):
    x = inputs
    if states is None:
      states = self.rnn.get_initial_state(x)
    x, states = self.rnn(x, initial_state = states, training = training)
    x = self.dense(x, training = training)
    if return_state:
      return x, states
    else:
      return x

class LSTM(tf.keras.Model):
  def __init__(self, K, m):
    super().__init__(self)
    self.lstm = tf.keras.layers.LSTM(m, return_sequences = True, return_state = True)
    self.dense = tf.keras.layers.Dense(K)

  def call(self, inputs, states = None, return_state = False, training = False):
    x = inputs
    if states is None:
      states = self.lstm.get_initial_state(x)
    x, *states = self.lstm(x, initial_state = states, training = training)
    x = self.dense(x, training = training)
    if return_state:
      return x, states
    else:
      return x

class LSTM2(tf.keras.Model):
  def __init__(self, K, m):
    super().__init__(self)
    self.lstm1 = tf.keras.layers.LSTM(m, return_sequences = True, return_state = True)
    self.lstm2 = tf.keras.layers.LSTM(m, return_sequences = True, return_state = True)
    self.dense = tf.keras.layers.Dense(K)

  def call(self, inputs, states = None, return_state = False, training = False):
    x = inputs
    if states is None:
      states_1 = self.lstm1.get_initial_state(x)
      states_2 = states_1
    else:
      states_1 = states[0]
      states_2 = states[1]
    x, *states_1 = self.lstm1(x, initial_state = states_1, training = training)
    x, *states_2 = self.lstm2(x, initial_state = states_2, training = training)
    x = self.dense(x, training = training)
    if return_state:
      return x, [states_1, states_2]
    else:
      return x

In [28]:
def generate_text_temperature(start, length, model, encoder, embedder, T = 1.0):
  text = []
  states = None
  start = embedder(np.expand_dims(encoder.text_to_inds(start), axis = 0))
  for i in range(length):
    logits, states = model(X = start, states = states)
    logits = logits[:, -1, :]/T
    pred = tf.random.categorical(logits, num_samples = 1)
    start = embedder(pred)
    text.append(encoder.ind_to_token[tf.squeeze(pred).numpy()])
  return "".join(text)

def generate_text_nucleus(start, length, model, encoder, embedder, theta = 1.0):
  text = []
  states = None
  start = embedder(np.expand_dims(encoder.text_to_inds(start), axis = 0))
  for i in range(length):
    logits, states = model(X = start, states = states)
    logits = logits[:, -1, :]
    logits = tf.squeeze(logits, axis = 0)
    probs = tf.nn.softmax(logits)
    sorted_probs = tf.sort(probs, direction = 'DESCENDING')
    sorted_probs_sum = tf.math.cumsum(sorted_probs)
    thresh_inds = tf.where(sorted_probs_sum <= theta)
    if len(thresh_inds) > 0:
      thresh_ind = thresh_inds[-1, 0].numpy()
    else:
      thresh_ind = 0
    top_probs = tf.multiply(probs, tf.cast(probs >= sorted_probs[thresh_ind], 'float32'))/sorted_probs_sum[thresh_ind]
    pred = tf.random.categorical([tf.math.log(top_probs)], num_samples = 1)
    start = embedder(pred)
    text.append(encoder.ind_to_token[tf.squeeze(pred).numpy()])
  return "".join(text)

In [None]:
books = fetch_data()

training_text = books[0] #+ books[1] + books[2] + books[3] + books[4] + books[5]
validation_text = books[6]
test_text = books[7]

#basic_encoder = BasicEncoder(training_text)
#byte_pair_encoder = BytePairEncoder(training_text, 200)
encoder = WordEncoder(training_text)
#print(encoder.get_size())

#embedder = tf.keras.layers.Embedding(encoder.get_size(), encoder.get_size(), embeddings_initializer = 'identity', trainable = False)

w2v_seq_length = 10
w2v_embedding_dim = 128
w2v_weights = get_w2v_weights(training_text, w2v_seq_length, encoder, w2v_embedding_dim, window_size = 2, n_neg_samples = 4, batch_size = 1024, buffer_size = 10000)
embedder = tf.keras.layers.Embedding(encoder.get_size(), w2v_embedding_dim, weights = w2v_weights, trainable = False)

spelling_dictionary = get_dictionary(training_text)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[[ 0.00714685 -0.01061977 -0.04332612 ... -0.01478454  0.04794068
  -0.03677227]
 [-0.23071742  0.2986801  -0.10573894 ...  0.08769035  0.11357135
   0.08303021]
 [ 0.26926363  0.14236975  0.42146704 ... -0.16885625  0.38207504
   0.18112388]
 ...
 [-0.16893771  0.16441053  0.15540835 ...  0.1937654   0.14275716
   0.19773583]
 [-0.06138138  0.22471967  0.21549143 ...  0.33383933  0.17223884
  -0.18794326]
 [ 0.04554403  0.00508648 -0.01926322 ... -0.02646342  0.01527454
   0.01926715]]


In [None]:
configs = dict(
    seq_length = 100,
    batch_size = 64,
    buffer_size = 10000,
    K = encoder.get_size(),
    m = 256,
    epochs=20,
    learning_rate=0.001,
)

In [None]:
training_batches = batch_data(training_text, configs["seq_length"], encoder, embedder, configs["batch_size"], configs["buffer_size"])
#validation_batches = batch_data_one_hot(validation_text, configs["seq_length"], basic_encoder)

In [None]:
class SpellChecker(tf.keras.callbacks.Callback):

  def on_epoch_end(self, epoch, logs = None):
    start = tf.constant(['.'])
    length = 1000
    print("\nCorrectly spelled (T = 1.0):", correctly_spelled(generate_text_temperature(start, length, self.model, basic_encoder, T = 1.0), spelling_dictionary))
    print("Correctly spelled (theta = 1.0):", correctly_spelled(generate_text_nucleus(start, length, self.model, basic_encoder, theta = 1.0), spelling_dictionary))

In [None]:
model = LSTM(K = configs["K"], m = configs["m"])
model.compile(optimizer = tf.optimizers.Adam(learning_rate = configs["learning_rate"]), loss = tf.losses.SparseCategoricalCrossentropy(from_logits = True))

checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath = checkpoint_prefix, save_weights_only = True)

history = model.fit(training_batches, epochs = configs["epochs"], callbacks = [checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
"""wandb.init(
        project="ProjectDD2424",
        config=configs)

config=wandb.config

model = LSTM(K = config.K, m = config.m)
model.compile(optimizer = tf.optimizers.Adam(learning_rate = config.learning_rate), loss = tf.losses.SparseCategoricalCrossentropy(from_logits = True))

checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath = checkpoint_prefix, save_weights_only = True)

history = model.fit(training_batches, epochs = config.epochs, callbacks = [checkpoint_callback, SpellChecker(),wandb.keras.WandbCallback()], validation_data = validation_batches)"""

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Correctly spelled (T = 1.0): 0.07142857142857142
Correctly spelled (T = 0.75): 0.1674641148325359
Correctly spelled (T = 0.5): 0.24705882352941178
Correctly spelled (theta = 1.0): 0.09333333333333334
Correctly spelled (theta = 0.75): 0.09696969696969697


  saving_api.save_model(
[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model in the h5py format. The model will be saved as as an W&B Artifact in the 'tf' format.


Correctly spelled (theta = 0.5): 0.23983739837398374


[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20240514_131749-wnrjtgd1/files/model-best)... Done. 0.0s




In [30]:
start = '.'
length = 1000
T = 1.0

text = generate_text_temperature(start, length, model, encoder, embedder, T)
print("Correctly spelled:", correctly_spelled(text, spelling_dictionary))
print(text)

ValueError: in user code:

    File "/var/folders/n8/_26t9s310m58_t5lx9v_qd3m0000gn/T/ipykernel_21204/3002542459.py", line 24, in __call__  *
        A = tf.linalg.matvec(self.W, H) + tf.linalg.matvec(self.U, X[t,:]) + self.b

    ValueError: slice index 1 of dimension 0 out of bounds. for '{{node strided_slice_1}} = StridedSlice[Index=DT_INT32, T=DT_FLOAT, begin_mask=2, ellipsis_mask=0, end_mask=2, new_axis_mask=0, shrink_axis_mask=1](X, strided_slice_1/stack, strided_slice_1/stack_1, strided_slice_1/stack_2)' with input shapes: [1,1,76], [2], [2], [2] and with computed input tensors: input[1] = <1 0>, input[2] = <2 0>, input[3] = <1 1>.


In [29]:
start = '.'
length = 1000
T = 1.0

text = generate_text_temperature(start, length, model, byte_pair_encoder, one_hot_embedder, T)
print("Correctly spelled:", correctly_spelled(text, spelling_dictionary))
print(text)

NameError: name 'byte_pair_encoder' is not defined