In [1]:
import os
# Switch the working directory to the parent of the current one.
# NOTE(review): the path is relative, so running this cell a second time moves up
# another level; run it exactly once per kernel session.
os.chdir("..")

In [2]:
import torch
import tensorflow as tf
import numpy as np

In [3]:
# Download the Tiny Shakespeare corpus (tf.keras.utils.get_file returns the local path).
shakespear_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
filepath = tf.keras.utils.get_file("shakespear.txt", shakespear_url)

# Read with an explicit encoding so decoding does not depend on the platform's
# default locale encoding.
with open(filepath, encoding="utf-8") as fp:
    shakespear = fp.read()

# Lower-case the whole corpus before tokenisation.
shakespear = shakespear.lower()

In [21]:
# Character-level tokeniser: every character of the corpus is mapped to an integer id.
text_tokenisations = tf.keras.layers.TextVectorization(
    split='character',
    standardize='lower',
)
text_tokenisations.adapt([shakespear])

# Ids 0 and 1 are for padding and unknown chars respectively, so shift every id
# down by 2 and shrink the vocabulary size by the same amount.
tokens = text_tokenisations([shakespear])[0] - 2
vocab_size = text_tokenisations.vocabulary_size() - 2
data_size = len(tokens)

In [88]:
def create_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    """
    Build a batched tf.data pipeline of (x, y) windows from a token sequence.

    Each window covers length + 1 consecutive tokens and consecutive windows start
    one token apart. x is the window without its last token; y is the window
    without its first token (i.e. x shifted by 1).

    sequence: input tensor of tokens created from text
    length: context size (how many chars at a time the model sees)
    shuffle: when true, shuffle the windows (buffer of 100_000) before batching
    seed: seed passed to the shuffle
    batch_size: number of windows per batch
    """
    window_size = length + 1

    def window_to_tensor(window_ds):
        # Each window arrives as a nested dataset; batch it into a single tensor.
        return window_ds.batch(window_size)

    def split_inputs_targets(window):
        return window[:, :-1], window[:, 1:]

    ds = (
        tf.data.Dataset.from_tensor_slices(sequence)
        .window(window_size, shift=1, drop_remainder=True)
        .flat_map(window_to_tensor)
    )
    if shuffle:
        ds = ds.shuffle(buffer_size=100_000, seed=seed)
    return ds.batch(batch_size).map(split_inputs_targets).prefetch(1)

In [86]:
# Intended 90:5:5 split = train:validation:test, taken as contiguous slices of `tokens`.
# The validation and test sets contain unseen data because the slices do not
# overlap; the seed only makes the shuffling of the training windows reproducible.
length = 64
train_end = 1_000_000  # tokens[:train_end] -> training
valid_end = 1_060_000  # tokens[train_end:valid_end] -> validation, the rest -> test
tf.random.set_seed(42)
train_set = create_dataset(tokens[:train_end], length=length, shuffle=True, seed=42)
valid_set = create_dataset(tokens[train_end:valid_end], length=length)
test_set = create_dataset(tokens[valid_end:], length=length)

In [None]:
# Character-level model: embedding -> GRU (one output per time step) -> softmax
# over the vocabulary at every position.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=16),
    tf.keras.layers.GRU(128, return_sequences=True),
    tf.keras.layers.Dense(vocab_size, activation="softmax"),
])
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="nadam",
    metrics=["accuracy"],
)

# Only keep the checkpoint with the best validation accuracy seen so far.
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "my_shakespeare_model.keras",
    monitor="val_accuracy",
    save_best_only=True,
)
history = model.fit(
    train_set,
    validation_data=valid_set,
    epochs=1,
    callbacks=[model_ckpt],
)

In [76]:
# Wrap the trained character model so it accepts raw strings: tokenise with the
# adapted TextVectorization layer, shift the ids down by 2, then run the model.
# The layer is named `text_tokenisations` in this notebook; `text_vec_layer` was
# never defined and raised a NameError on a fresh kernel.
shakespeare_model = tf.keras.Sequential([
    text_tokenisations,
    tf.keras.layers.Lambda(lambda X: X - 2),  # no <PAD> or <UNK> tokens
    model
])

# Probabilities for the character following the last position of the prompt.
y_proba = shakespeare_model.predict(["To be or not to b"])[0, -1]
y_pred = tf.argmax(y_proba)  # choose the most probable character ID
# Undo the -2 shift to index back into the tokeniser's vocabulary.
text_tokenisations.get_vocabulary()[y_pred + 2]

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, None), dtype=tf.int64, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

In [134]:
# NOTE(review): this is an identical redefinition of create_dataset from an earlier
# cell in this notebook; it shadows the earlier one and one copy could be removed.
def create_dataset(sequence, length, shuffle=False, seed=None, batch_size=32): 
    """
    Takes input tensors which are tokens created from text and creates batches of (x, y),
    where y is x shifted by 1.

    Windows of length + 1 tokens are taken at every position (shift=1); x is a window
    without its last token and y is the window without its first token.

    sequence: input tensors
    length: context size (how many chars at a time do you want the model to see)
    shuffle: if true, shuffle the windows before batching
    seed: seed passed to the shuffle
    batch_size: number of windows per batch
    """
    ds = tf.data.Dataset.from_tensor_slices(sequence) 
    ds = ds.window(length + 1, shift=1, drop_remainder=True) 
    # each window is itself a dataset; batch it into one tensor of length + 1 tokens
    ds = ds.flat_map(lambda window_ds: window_ds.batch(length + 1)) 
    if shuffle: 
        ds = ds.shuffle(buffer_size=100_000, seed=seed) 
    ds = ds.batch(batch_size) 
    return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)

In [137]:
import torch
from torch.utils.data import Dataset
 

class PicoGPTDataset(Dataset):
    """Sliding-window next-token dataset built from a single piece of text.

    The text is encoded once with ``tokenizer.encode``. Windows of ``context_size``
    token ids are then taken every ``stride`` positions; the target for each
    window is the same span shifted one position to the right.
    """

    def __init__(self, text, tokenizer, context_size, stride):
        self.tokenizer = tokenizer

        encoded = tokenizer.encode(text)
        # Stop early enough that every target window (shifted by one) is full length.
        window_starts = range(0, len(encoded) - context_size, stride)

        # torch.tensor leaves requires_grad at its default of False, so these
        # id tensors do not take part in gradient computation.
        self.input_ids = [
            torch.tensor(encoded[start:start + context_size])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1:start + context_size + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the (input, target) tensor pair at position ``idx``."""
        inputs = self.input_ids[idx]
        targets = self.target_ids[idx]
        return inputs, targets

In [None]:
# NOTE(review): this repeats the model/training cell that appears earlier in the notebook.
# It needs vocab_size, train_set and valid_set to already exist in the kernel; the
# recorded output below shows it failing with NameError because train_set was not defined.
# Architecture: embedding -> GRU returning one output per time step -> softmax over the vocabulary.
model = tf.keras.Sequential([ 
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=16), 
    tf.keras.layers.GRU(128, return_sequences=True), 
    tf.keras.layers.Dense(vocab_size, activation="softmax") 
]) 
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", 
              metrics=["accuracy"])

# save_best_only=True: write the checkpoint only when val_accuracy improves
model_ckpt = tf.keras.callbacks.ModelCheckpoint("my_shakespeare_model.keras", monitor="val_accuracy", save_best_only=True) 
history = model.fit(train_set, validation_data=valid_set, epochs=1, callbacks=[model_ckpt])

NameError: name 'train_set' is not defined