<a href="https://colab.research.google.com/github/annamaartensson/dd2424project/blob/issue%2F14/models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import tensorflow as tf
import numpy as np
import pathlib
import os
import platform
import re

# Record the interpreter and TensorFlow versions this notebook was run with.
print(platform.python_version(), tf.__version__, sep = "\n")

3.10.12
2.15.0


In [None]:
!pip install -qq -U wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.3/277.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import wandb
# Interactive Weights & Biases login; the recorded output below shows it prompting for an API key.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [23]:
def fetch_data():
  """Download Project Gutenberg text 31100 and split it into individual books.

  The file is fetched (and cached under ./tmp) with tf.keras.utils.get_file,
  read as text, and sliced at hard-coded character offsets.

  Returns:
    A list of eight strings, in this order: persuasion, northanger_abbey,
    mansfield_park, emma, lady_susan, love_and_friendship,
    pride_and_prejudice, sense_and_sensibility.
  """
  cache_dir = './tmp'
  dataset_file_name = 'pg31100.txt'
  dataset_file_origin = 'https://www.gutenberg.org/cache/epub/31100/pg31100.txt'
  dataset_file_path = tf.keras.utils.get_file(fname = dataset_file_name, origin = dataset_file_origin, cache_dir = pathlib.Path(cache_dir).absolute())
  # The offsets below are character positions, so the decoding must not depend on
  # the platform default; the context manager also guarantees the file is closed.
  with open(dataset_file_path, mode = 'r', encoding = 'utf-8') as dataset_file:
    text = dataset_file.read()
  book_spans = [
    (1437, 468297),      # persuasion
    (468297, 901707),    # northanger_abbey
    (901707, 1784972),   # mansfield_park
    (1784972, 2668012),  # emma
    (2668012, 2795312),  # lady_susan
    (2795312, 2980261),  # love_and_friendship
    (2980261, 3665048),  # pride_and_prejudice
    # NOTE(review): this span starts after the previous one ends (3665048);
    # presumably the gap is skipped on purpose -- confirm.
    (3682008, 4355100),  # sense_and_sensibility
  ]
  return [text[begin:end] for begin, end in book_spans]

In [None]:
# Disabled helper: the definition is wrapped in a bare string literal, so running
# this cell defines nothing.
"""def get_vocabulary(text):
  vocabulary = sorted(set(text))
  char_to_ind = tf.keras.layers.StringLookup(vocabulary = list(vocabulary), mask_token = None)
  ind_to_char = tf.keras.layers.StringLookup(vocabulary = char_to_ind.get_vocabulary(), invert = True, mask_token = None)
  return vocabulary, char_to_ind, ind_to_char"""

In [95]:
class BasicEncoder:
  """Character-level encoder mapping each distinct character of a text to an index.

  Attributes:
    vocabulary: sorted list of the distinct characters in the text.
    ind_to_token: list of tokens by index; index 0 is the '[UNK]' token.
    token_to_ind: dict mapping each token to its index.
  """

  def __init__(self, text):
    """Build the vocabulary and both lookup tables from `text`."""
    self.vocabulary = sorted(set(text))
    # Separate list from `vocabulary`, with the unknown token in front.
    self.ind_to_token = ['[UNK]'] + self.vocabulary
    self.token_to_ind = {token : ind for ind, token in enumerate(self.ind_to_token)}

  def get_size(self):
    """Return the number of tokens, including '[UNK]'."""
    return len(self.ind_to_token)

  def text_to_inds(self, text):
    """Return the list of indices for the characters of `text`.

    Characters that are not in the vocabulary map to the '[UNK]' index.
    """
    unknown = self.token_to_ind['[UNK]']
    return [self.token_to_ind.get(c, unknown) for c in text]

class BytePairEncoder(BasicEncoder):
  """Byte-pair encoder built on top of the character vocabulary of BasicEncoder.

  Starting from single characters, the most frequent adjacent token pair is
  repeatedly merged into a new token until the vocabulary reaches `target_size`
  (or no pair is left to merge).

  Attributes:
    merges: dict mapping a pair of token indices to the index of the merged
      token, in the order in which the merges were learned.
  """

  def __init__(self, text, target_size):
    """Build the character vocabulary from `text`, then learn merges up to `target_size` tokens."""
    super().__init__(text)
    self.__expand_vocabulary(text, target_size)

  def __merge_pairs(self, tokens, pair, val):
    """Return a copy of `tokens` with every non-overlapping occurrence of `pair` replaced by `val`."""
    merged_tokens = []
    last = len(tokens)-1
    i = 0
    while i < len(tokens):
      if i < last and tokens[i] == pair[0] and tokens[i+1] == pair[1]:
        merged_tokens.append(val)
        i += 2
      else:
        merged_tokens.append(tokens[i])
        i += 1
    return merged_tokens

  def __get_pair_counts(self, tokens):
    """Return a dict counting each adjacent pair of `tokens`."""
    counts = {}
    for pair in zip(tokens, tokens[1:]):
      counts[pair] = counts.get(pair, 0)+1
    return counts

  def __expand_vocabulary(self, text, target_size):
    """Learn merges on `text` until the vocabulary holds `target_size` tokens."""
    self.merges = {}
    tokens = [self.token_to_ind[c] for c in text]
    while self.get_size() < target_size:
      counts = self.__get_pair_counts(tokens)
      if not counts:
        # Fewer than two tokens left: nothing more can be merged, so stop
        # instead of letting max() fail on an empty dict.
        break
      best_pair = max(counts, key = counts.get)
      new_token = self.ind_to_token[best_pair[0]]+self.ind_to_token[best_pair[1]]
      new_val = len(self.ind_to_token)
      self.ind_to_token.append(new_token)
      self.token_to_ind[new_token] = new_val
      self.merges[best_pair] = new_val
      tokens = self.__merge_pairs(tokens, best_pair, new_val)

  def text_to_inds(self, text):
    """Encode `text` into token indices using the learned merges.

    The text is first encoded per character (unknown characters become
    '[UNK]'); the merges are then replayed in the order they were learned, so
    the result matches the tokenisation used while learning the merges.
    """
    inds = super().text_to_inds(text)
    # dicts preserve insertion order, so this iterates in learned order.
    for pair, val in self.merges.items():
      if len(inds) < 2:
        break
      inds = self.__merge_pairs(inds, pair, val)
    return inds

In [43]:
def batch_data(text, seq_length, encoder, batch_size = 1, buffer_size = 0):
  """Turn `text` into a tf.data pipeline of (input, target) batches.

  The encoded text is cut into chunks of seq_length+1 indices (the remainder is
  dropped); each chunk yields its first seq_length indices as input and the
  chunk shifted by one position as target.

  Args:
    text: text to encode.
    seq_length: number of indices in each input sequence.
    encoder: object providing text_to_inds(text).
    batch_size: number of sequences per batch; incomplete batches are dropped.
    buffer_size: shuffle buffer size; sequences are shuffled only if > 0.

  Returns:
    The batched, prefetched dataset.
  """
  def split_input_target(chunk):
    return chunk[:seq_length], chunk[1:]

  token_inds = encoder.text_to_inds(text)
  chunks = tf.data.Dataset.from_tensor_slices(token_inds).batch(seq_length+1, drop_remainder = True)
  pairs = chunks.map(split_input_target)
  if buffer_size > 0:
    pairs = pairs.shuffle(buffer_size)
  return pairs.batch(batch_size, drop_remainder = True).prefetch(tf.data.experimental.AUTOTUNE)

In [26]:
def clean_text(text):
  """Normalise `text` and split it into a list of words.

  The text is lower-cased; the characters & [ ] _ ! ? * . , ( ) ; : " ' and
  runs of digits are removed; newlines and hyphens become spaces; the result is
  split on whitespace.

  Returns:
    A list of word strings (empty if no words remain).
  """
  lower = text.lower()
  # Raw strings keep the regex escapes valid Python string literals.
  no_spec = re.sub(r"[&\[\]_!?*.,();:\"']|[0-9]+", "", lower)
  no_enter = re.sub(r"[\n-]", " ", no_spec)
  return no_enter.split()

In [27]:
def get_dictionary(text):
  """Return the set of distinct words produced by clean_text(text)."""
  return set(clean_text(text))

In [28]:
def correctly_spelled(text, dictionary):
  """Return the fraction of the words in `text` that occur in `dictionary`.

  Args:
    text: text to evaluate; it is split into words with clean_text.
    dictionary: container of known words (membership is tested with `in`).

  Returns:
    A float in [0, 1]; 0.0 when `text` contains no words at all.
  """
  words = clean_text(text)
  if not words:
    # No words to check: report 0.0 rather than dividing by zero.
    return 0.0
  return sum(1 for w in words if w in dictionary)/len(words)

In [29]:
class RNN(tf.keras.Model):
  """Model made of a fixed one-hot embedding, a SimpleRNN layer and a Dense output layer."""

  def __init__(self, K, m):
    """Create the layers.

    Args:
      K: size of the one-hot embedding and of the Dense output layer.
      m: number of units of the SimpleRNN layer.
    """
    # The base constructor takes no positional arguments; do not pass `self`.
    super().__init__()
    # Non-trainable K x K identity embedding, i.e. a one-hot encoding of the indices.
    self.onehot = tf.keras.layers.Embedding(K, K, embeddings_initializer = 'identity', trainable = False)
    self.rnn = tf.keras.layers.SimpleRNN(m, return_sequences = True, return_state = True)
    self.dense = tf.keras.layers.Dense(K)

  def call(self, inputs, states = None, return_state = False, training = False):
    """Run the model on `inputs`.

    Args:
      inputs: token indices fed to the embedding layer.
      states: initial state for the recurrent layer; when None the layer's own
        initial state is used.
      return_state: if True, also return the final recurrent state.
      training: forwarded to every layer.

    Returns:
      The Dense layer output, or (output, states) when return_state is True.
    """
    x = inputs
    x = self.onehot(x, training = training)
    if states is None:
      states = self.rnn.get_initial_state(x)
    x, states = self.rnn(x, initial_state = states, training = training)
    x = self.dense(x, training = training)
    if return_state:
      return x, states
    else:
      return x

class LSTM(tf.keras.Model):
  """Model made of a fixed one-hot embedding, one LSTM layer and a Dense output layer."""

  def __init__(self, K, m):
    """Create the layers.

    Args:
      K: size of the one-hot embedding and of the Dense output layer.
      m: number of units of the LSTM layer.
    """
    # The base constructor takes no positional arguments; do not pass `self`.
    super().__init__()
    # Non-trainable K x K identity embedding, i.e. a one-hot encoding of the indices.
    self.onehot = tf.keras.layers.Embedding(K, K, embeddings_initializer = 'identity', trainable = False)
    self.lstm = tf.keras.layers.LSTM(m, return_sequences = True, return_state = True)
    self.dense = tf.keras.layers.Dense(K)

  def call(self, inputs, states = None, return_state = False, training = False):
    """Run the model on `inputs`.

    Args:
      inputs: token indices fed to the embedding layer.
      states: initial states for the LSTM layer; when None the layer's own
        initial state is used.
      return_state: if True, also return the final LSTM states.
      training: forwarded to every layer.

    Returns:
      The Dense layer output, or (output, states) when return_state is True,
      where `states` is the list of state tensors returned by the LSTM layer.
    """
    x = inputs
    x = self.onehot(x, training = training)
    if states is None:
      states = self.lstm.get_initial_state(x)
    # The layer returns the output followed by its state tensors.
    x, *states = self.lstm(x, initial_state = states, training = training)
    x = self.dense(x, training = training)
    if return_state:
      return x, states
    else:
      return x

class LSTM2(tf.keras.Model):
  """Model made of a fixed one-hot embedding, two stacked LSTM layers and a Dense output layer."""

  def __init__(self, K, m):
    """Create the layers.

    Args:
      K: size of the one-hot embedding and of the Dense output layer.
      m: number of units of each LSTM layer.
    """
    # The base constructor takes no positional arguments; do not pass `self`.
    super().__init__()
    # Non-trainable K x K identity embedding, i.e. a one-hot encoding of the indices.
    self.onehot = tf.keras.layers.Embedding(K, K, embeddings_initializer = 'identity', trainable = False)
    self.lstm1 = tf.keras.layers.LSTM(m, return_sequences = True, return_state = True)
    self.lstm2 = tf.keras.layers.LSTM(m, return_sequences = True, return_state = True)
    self.dense = tf.keras.layers.Dense(K)

  def call(self, inputs, states = None, return_state = False, training = False):
    """Run the model on `inputs`.

    Args:
      inputs: token indices fed to the embedding layer.
      states: pair [states_1, states_2] with the initial states of the first
        and second LSTM layer; when None both layers start from the initial
        state obtained from the first layer.
      return_state: if True, also return the final states of both layers.
      training: forwarded to every layer.

    Returns:
      The Dense layer output, or (output, [states_1, states_2]) when
      return_state is True.
    """
    x = inputs
    x = self.onehot(x, training = training)
    if states is None:
      states_1 = self.lstm1.get_initial_state(x)
      # Both layers have m units, so the same initial state is reused for the second layer.
      states_2 = states_1
    else:
      states_1 = states[0]
      states_2 = states[1]
    x, *states_1 = self.lstm1(x, initial_state = states_1, training = training)
    x, *states_2 = self.lstm2(x, initial_state = states_2, training = training)
    x = self.dense(x, training = training)
    if return_state:
      return x, [states_1, states_2]
    else:
      return x

In [104]:
def generate_text_temperature(start, length, model, encoder, T = 1.0):
  """Generate `length` tokens with temperature sampling.

  Args:
    start: prompt text; it is encoded with encoder.text_to_inds and fed to the
      model first. It is not included in the returned text.
    length: number of tokens to sample.
    model: callable accepting inputs, states and return_state and returning
      (logits, states).
    encoder: object providing text_to_inds and ind_to_token.
    T: temperature; the logits of the last position are divided by it.

  Returns:
    The sampled tokens joined into one string.
  """
  generated = []
  states = None
  model_input = np.expand_dims(encoder.text_to_inds(start), axis = 0)
  for _ in range(length):
    logits, states = model(inputs = model_input, states = states, return_state = True)
    scaled_logits = logits[:, -1, :]/T
    # The sampled index is the only input of the next step; context is carried by `states`.
    model_input = tf.random.categorical(scaled_logits, num_samples = 1)
    token_ind = tf.squeeze(model_input).numpy()
    generated.append(encoder.ind_to_token[token_ind])
  return "".join(generated)

def generate_text_nucleus(start, length, model, encoder, theta = 1.0):
  """Generate `length` tokens with nucleus (top-p style) sampling.

  At every step the probabilities of the last position are sorted in
  descending order and accumulated. The cutoff is the last sorted position
  whose cumulative probability is <= theta (position 0 if there is none). Only
  tokens at least as probable as the token at the cutoff can be sampled.

  Args:
    start: prompt text; it is encoded with encoder.text_to_inds and fed to the
      model first. It is not included in the returned text.
    length: number of tokens to sample.
    model: callable accepting inputs, states and return_state and returning
      (logits, states).
    encoder: object providing text_to_inds and ind_to_token.
    theta: cumulative probability threshold.

  Returns:
    The sampled tokens joined into one string.
  """
  generated = []
  states = None
  model_input = np.expand_dims(encoder.text_to_inds(start), axis = 0)
  for _ in range(length):
    logits, states = model(inputs = model_input, states = states, return_state = True)
    probs = tf.nn.softmax(tf.squeeze(logits[:, -1, :], axis = 0))
    ranked_probs = tf.sort(probs, direction = 'DESCENDING')
    cumulative = tf.math.cumsum(ranked_probs)
    within_theta = tf.where(cumulative <= theta)
    cutoff = within_theta[-1, 0].numpy() if len(within_theta) > 0 else 0
    # Zero out everything less probable than the cutoff token, then rescale.
    keep_mask = tf.cast(probs >= ranked_probs[cutoff], 'float32')
    nucleus_probs = tf.multiply(probs, keep_mask)/cumulative[cutoff]
    model_input = tf.random.categorical([tf.math.log(nucleus_probs)], num_samples = 1)
    token_ind = tf.squeeze(model_input).numpy()
    generated.append(encoder.ind_to_token[token_ind])
  return "".join(generated)

In [105]:
books = fetch_data()

# One book each for training, validation and test.
# To train on more text, concatenate books[1] ... books[5] onto the training text.
training_text, validation_text, test_text = books[0], books[6], books[7]

# Words of the training text, used as the reference for the spelling metric.
spelling_dictionary = get_dictionary(training_text)

basic_encoder = BasicEncoder(training_text)
#byte_pair_encoder = BytePairEncoder(training_text, 200)

In [109]:
# Hyperparameters shared by the data pipeline and the model.
configs = {
    "seq_length": 100,
    "batch_size": 64,
    "buffer_size": 10000,
    "K": basic_encoder.get_size(),  # taken from the encoder's vocabulary size
    "m": 256,
    "epochs": 20,
    "learning_rate": 0.001,
}

In [110]:
# Shuffled training batches built from the training text.
training_batches = batch_data(
    text = training_text,
    seq_length = configs["seq_length"],
    encoder = basic_encoder,
    batch_size = configs["batch_size"],
    buffer_size = configs["buffer_size"],
)
#validation_batches = batch_data(validation_text, configs["seq_length"], basic_encoder)

In [None]:
class SpellChecker(tf.keras.callbacks.Callback):
  """Keras callback printing the share of correctly spelled generated words after each epoch.

  Text is generated with temperature sampling (T = 1.0, 0.75, 0.5) and with
  nucleus sampling (theta = 1.0, 0.75, 0.5), and each sample is scored with
  correctly_spelled.
  """

  def __init__(self, encoder = None, dictionary = None, start = '.', length = 1000):
    """Configure the callback.

    Args:
      encoder: encoder passed to the generators; defaults to the notebook's
        `basic_encoder`, looked up when an epoch ends.
      dictionary: known words for the spelling score; defaults to the
        notebook's `spelling_dictionary`, looked up when an epoch ends.
      start: prompt text passed to the generators.
      length: number of tokens generated per sample.
    """
    super().__init__()
    self.encoder = encoder
    self.dictionary = dictionary
    self.start = start
    self.length = length

  def on_epoch_end(self, epoch, logs = None):
    """Generate six text samples with the current model and print their spelling scores."""
    encoder = basic_encoder if self.encoder is None else self.encoder
    dictionary = spelling_dictionary if self.dictionary is None else self.dictionary
    # Leading newline only on the first line, to separate it from the progress bar.
    prefix = "\n"
    for T in (1.0, 0.75, 0.5):
      text = generate_text_temperature(self.start, self.length, self.model, encoder, T = T)
      print(f"{prefix}Correctly spelled (T = {T}):", correctly_spelled(text, dictionary))
      prefix = ""
    for theta in (1.0, 0.75, 0.5):
      text = generate_text_nucleus(self.start, self.length, self.model, encoder, theta = theta)
      print(f"Correctly spelled (theta = {theta}):", correctly_spelled(text, dictionary))

In [111]:
# Build and compile the model; the loss works on raw logits.
model = LSTM(K = configs["K"], m = configs["m"])
model.compile(
    optimizer = tf.optimizers.Adam(learning_rate = configs["learning_rate"]),
    loss = tf.losses.SparseCategoricalCrossentropy(from_logits = True),
)

# Save the weights after every epoch as ./training_checkpoints/ckpt_<epoch>.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True,
)

history = model.fit(
    training_batches,
    epochs = configs["epochs"],
    callbacks = [checkpoint_callback],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
 2/72 [..............................] - ETA: 26s - loss: 2.8228 

KeyboardInterrupt: 

In [None]:
# Disabled Weights & Biases training run: the code is wrapped in a bare string
# literal, so running this cell trains nothing. The outputs recorded below this
# cell were therefore not produced by the cell in its current form.
# NOTE(review): it refers to `validation_batches`, whose definition is currently
# commented out -- restore that before re-enabling this cell.
"""wandb.init(
        project="ProjectDD2424",
        config=configs)

config=wandb.config

model = LSTM(K = config.K, m = config.m)
model.compile(optimizer = tf.optimizers.Adam(learning_rate = config.learning_rate), loss = tf.losses.SparseCategoricalCrossentropy(from_logits = True))

checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath = checkpoint_prefix, save_weights_only = True)

history = model.fit(training_batches, epochs = config.epochs, callbacks = [checkpoint_callback, SpellChecker(),wandb.keras.WandbCallback()], validation_data = validation_batches)"""

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Correctly spelled (T = 1.0): 0.07142857142857142
Correctly spelled (T = 0.75): 0.1674641148325359
Correctly spelled (T = 0.5): 0.24705882352941178
Correctly spelled (theta = 1.0): 0.09333333333333334
Correctly spelled (theta = 0.75): 0.09696969696969697


  saving_api.save_model(
[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model in the h5py format. The model will be saved as as an W&B Artifact in the 'tf' format.


Correctly spelled (theta = 0.5): 0.23983739837398374


[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20240514_131749-wnrjtgd1/files/model-best)... Done. 0.0s




In [113]:
# Sample text from the trained model and report how many of its words are known.
start = '.'
length = 1000
T = 1.0

text = generate_text_temperature(start = start, length = length, model = model, encoder = basic_encoder, T = T)
spelling_score = correctly_spelled(text, spelling_dictionary)
print("Correctly spelled:", spelling_score)
print(text)

Correctly spelled: 0.12234042553191489
enRk,, vs x ane oYh nerelr.ilg  n soe rae it  aueoseoo oes ycq tuu ed
 bpi  b ad pedt,ii

 -dcohn
hi
its  ohi,r d ced hii tseoinbe rle n rrhuiem aene.sser  tssoll
 chca Mua y einee acr trocththt yaes ondelgurarn tslhiels uc angn ir  fsra meaadmgoineo  oyl fordvoosl
ipith apee tgt, 
an
g eth wrt rta; sheaiiin ,hsbewehos ;ihlih .ehepw red te ae  a t
omere, ry rebe h bele f ps i s.vI ioenttndgeehoIo e s srderu ai
liededgeh tbertse;t"vr Iestueringe s ecblu  recfad fnoanv, 
hieryys sanwprlcooro trai l otge tf do sg a,tptoeg blhg thinilt ulse r'he ye the,rkes ouo, h dru uie itopobr  us hy
ru at  aiary dacl hnrv x. me a
Tm. l,wco.ve tsirf  ndBsasion yi wyMi rerab
h ll
eth awpp naqtsnnnfutee fastr onont  net a  odfomou boys ans ht he ;e tone sbAe;it nowy imle he y uoiws tar h,pngA"s ge
asaold  chinurn lmt naaym ee io  a,  fdelai o f et ieandloAeatd opoptbst u rihe  , in  wis
heeertb .mol, nnes,
chnr  nyetghas n  tik  t ld wed
h 
u ,ri s ne iroerthellrlnfe