In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
import urllib.request
import tiktoken

from torch.utils.data import Dataset, DataLoader, Subset
from typing import List

In [2]:
# Gets the verdict
url: str = "https://www.gutenberg.org/ebooks/67237.txt.utf-8"
urllib.request.urlretrieve(url, "books/the-verdict.txt")

# Initializes the string that will contain the loaded text above
raw_text: str = None

# Reads the loaded text
with open("books/the-verdict.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

# Logs the metadata of the text
print(f"Total number of characters: {len(raw_text)}")


Total number of characters: 357726


In [3]:
# Initializes the tokenizer
tokenizer: tiktoken.core.Encoding = tiktoken.get_encoding("gpt2")

# Runs it on the raw text
tokens = tokenizer.encode(raw_text, allowed_special={"<|endoftext|>"})
print(f"Number of tokens: {len(tokens)}")
print(f"Vocabulary size: {tokenizer.n_vocab}")

Number of tokens: 97421
Vocabulary size: 50257


In [4]:
# The below parameters define the shape of the embedding layer
vocabulary_size: int = tokenizer.n_vocab # => is the input size of the embedding layer, ie, the size of the vocabulary
token_embedding_dim: int = 256 # => is the output size of the embedding layer

# The below parameters define the shape of the positional embedding layer
# The positional embedding layer is of shape context_size x output_dim
context_length: int = 10
output_dim: int = token_embedding_dim # => is the size of the output of the positional embedding,
                                      # that we want to be equal to the token_embedding_dim as this will be the input to the model

# The size of the sequence to be generated
max_new_tokens: int = 15

# The stride
stride: int = 1

# Batch size
batch_size: int = 64

print(f"Vocabulary size set to: {vocabulary_size}")

Vocabulary size set to: 50257


In [5]:
# Example of using context with the tokenizer
print(f"The first 100 characters of the raw text:\n\n\"{raw_text[: 180]}\"\n")
print("Example of text generating with context_size slicing:")
for i in range(1, 4+1):
    context: int = tokens[: i]
    desired: int = tokens[i]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

The first 100 characters of the raw text:

"﻿The Project Gutenberg eBook of An open verdict
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost "

Example of text generating with context_size slicing:
� ----> �
� ----> �
﻿ ----> The
﻿The ---->  Project


In [6]:
class GPTDatasetV1(Dataset):
    def __init__(self, text, tokenizer, context_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(text)

        for i in range(0, len(token_ids) - context_length, stride):
            input_chunk = token_ids[i:i + context_length]
            self.input_ids.append(torch.tensor(input_chunk))

            temp_id = token_ids[i + context_length]
            target_tensor = torch.zeros([50257])
            target_tensor[temp_id] = 1.0
            self.target_ids.append(target_tensor)

            # target_chunk = token_ids[i + 1: i + max_length + 1]            
            # self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [7]:
def create_dataloader_v1(txt, batch_size=batch_size, context_length=context_length,
    stride=stride, shuffle=True, drop_last=True,
    num_workers=0):

    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, context_length, stride)
    
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers)
    
    return dataloader

In [8]:
class LSTMText(nn.Module):
    def __init__(
            self,
            input_size: int,
            hidden_size: int
    ):
        '''
        input_size: int -> size of the embedding dimension in the transformer parlance
        hidden_size: int -> size of the hidden state
        
        '''
        super().__init__()
        self.input_size: int = input_size
        self.hidden_size: int = hidden_size

        # Input gate (I):
        # I_t = sigma(X_t.W_xi + H_t-1.W_hi + b_i)
        # Where:
        # X_t is X(the sequence) at time t
        # H_t-1 is H(the hidden state) at time t-1
        # W_xi is the weight matrix of X to I gate
        self.W_xi: nn.Parameter = nn.Parameter(torch.Tensor(self.input_size, self.hidden_size))

        # W_hi is the weight matrix of h to the I gate
        self.W_hi: torch.Tensor = nn.Parameter(torch.Tensor(self.hidden_size, self.hidden_size))

        # self.b_i is the bias to the I gate
        self.b_i: torch.Tensor = nn.Parameter(torch.Tensor( self.hidden_size))


        # Forget gate (F):
        # F_t = sigma(X_t.W_xf + H_t-1.W_hf + b_f)
        # Where:
        # X_t is X(the sequence) at time t
        # H_t-1 is H(the hidden state) at time t-1
        # W_xf is the weight matrix of X to F gate
        self.W_xf: nn.Parameter = nn.Parameter(torch.Tensor(self.input_size, self.hidden_size))

        # W_hf is the weight matrix of h to the F gate
        self.W_hf: torch.Tensor = nn.Parameter(torch.Tensor(self.hidden_size, self.hidden_size))

        # self.b_f is the bias to the F gate
        self.b_f: torch.Tensor = nn.Parameter(torch.Tensor( self.hidden_size))


        # Output gate (O):
        # O_t = sigma(X_t.W_xo + H_t-1.W_ho + b_o)
        # Where:
        # X_t is X(the sequence) at time t
        # H_t-1 is H(the hidden state) at time t-1
        # W_xo is the weight matrix of X to O gate
        self.W_xo: nn.Parameter = nn.Parameter(torch.Tensor(self.input_size, self.hidden_size))

        # W_ho is the weight matrix of h to the O gate
        self.W_ho: torch.Tensor = nn.Parameter(torch.Tensor(self.hidden_size, self.hidden_size))

        # self.b_o is the bias to the O gate
        self.b_o: torch.Tensor = nn.Parameter(torch.Tensor( self.hidden_size))

        
        # Cell (C):
        # C_t = sigma(X_t.W_xc + H_t-1.W_hc + b_c)
        # Where:
        # X_t is X(the sequence) at time t
        # H_t-1 is H(the hidden state) at time t-1
        # W_xc is the weight matrix of X to C cell
        self.W_xc: nn.Parameter = nn.Parameter(torch.Tensor(self.input_size, self.hidden_size))

        # W_ho is the weight matrix of h to the C cell
        self.W_hc: torch.Tensor = nn.Parameter(torch.Tensor(self.hidden_size, self.hidden_size))

        # self.b_o is the bias to the C cell
        self.b_c: torch.Tensor = nn.Parameter(torch.Tensor( self.hidden_size))

        # Initializes all weights 
        self.initialize_weights()

    def __repr__(self):
        repr: str = f"LSTMText(input_size={self.input_size}, hidden_size={self.hidden_size})"
        return repr

    def initialize_weights(self):
        stdev: float = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdev, stdev)

    def forward(self, X: torch.Tensor, states: torch.Tensor = None):
        """
        assumes x.shape represents (batch_size, sequence_size, embedding_dimension)
        """
        bs, sequence_size, input_size = X.size()
        
        if input_size != self.input_size:
            raise ValueError(f"Input shape: {input_size} is not equal to model input size: {self.input_size}")

        if states is None:
            H_t, C_t = (
                torch.zeros(bs, self.hidden_size).to(device=X.device),
                torch.zeros(bs, self.hidden_size).to(device=X.device)
            )
        else:
            H_t, C_t = states

        outputs = []
        for t in range(sequence_size):
            x = X[:, t, :]
            # I is the input gate
            I_t = torch.sigmoid(torch.matmul(x, self.W_xi) + torch.matmul(H_t, self.W_hi) + self.b_i)

            # F is the forget state
            F_t = torch.sigmoid(torch.matmul(x, self.W_xf) + torch.matmul(H_t, self.W_hf) + self.b_f)

            # O is the output state
            O_t = torch.sigmoid(torch.matmul(x, self.W_xo) + torch.matmul(H_t, self.W_ho) + self.b_o)

            # C_t, the memory (C)ell is:
            # C_t = F(.)C_t-1 + I_t(.)C_temp
            # C_temp = tanh(X_t.W_xc + H_t-1.W_hc + b_c)
            C_temp = torch.tanh(torch.matmul(x, self.W_xc) + torch.matmul(H_t, self.W_hc) + self.b_c)
            C_t = F_t * C_t + I_t * C_temp
            H_t = O_t * torch.tanh(C_t)
            outputs.append(H_t)

        result = torch.cat([outputs[-1]], dim=0)
        return result, (H_t, C_t)


In [9]:
class RNNModel(nn.Module):
    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 vocabulary_size: int,
                 context_length: int,
                 output_dim: int,
                 out_features: int
                 ):
        super().__init__()

        # The parameters
        self.vocabulary_size: int = vocabulary_size
        self.hidden_size: int = hidden_size
        self.input_size: int = input_size
        self.context_length: int = context_length
        self.output_dim: int = output_dim
        self.output_features: int = out_features

        # the embedding layer
        self.token_embedding_layer: nn.Embedding = nn.Embedding(num_embeddings=self.vocabulary_size, embedding_dim=output_dim)

        # the positional embedding layer
        self.pos_embedding_layer: nn.Embedding = nn.Embedding(num_embeddings=self.context_length, embedding_dim=self.hidden_size)

        # the rnn (LSTM or GRU) rnn
        self.rnn_layer: LSTMText = LSTMText(input_size=self.input_size, hidden_size=hidden_size)

        # Linear layer to generate the output
        self.output_layer: nn.Linear = nn.Linear(in_features=input_size, out_features=out_features)

    def __repr__(self):
        repr: str = f'''RNNModel(input_size={self.input_size},
        hidden_size={self.hidden_size},
        vocabulary_size={self.vocabulary_size},
        context_length={self.context_length},
        output_dim={self.output_dim},
        out_features={self.output_features},
        (token_embedding_layer)={self.token_embedding_layer}),
        (pos_embedding_layer)={self.pos_embedding_layer},
        (rnn_layer)={self.rnn_layer},
        (output_layer)={self.output_layer}
        )
        '''
        return repr
    
    def forward(self, X: torch.Tensor):
        token_embeddings_ = self.token_embedding_layer(X)
        pos_embeddings_ = self.pos_embedding_layer(torch.arange(self.context_length))
        input_embeddings_ = token_embeddings_ + pos_embeddings_

        output_, _ = self.rnn_layer(input_embeddings_)
        output_ = self.output_layer(output_)
        
        return output_


In [10]:
device: str = ["cuda" if torch.cuda.is_available() else "cpu"]
device

['cuda']

In [11]:
model = RNNModel(input_size=token_embedding_dim,
                 hidden_size=output_dim,
                 vocabulary_size=vocabulary_size,
                 context_length=context_length,
                 output_dim=output_dim,
                 out_features=vocabulary_size)
model.eval()

total_params: int = sum([p.numel() for p in model.parameters()])
print(f"Total number of parameters: {total_params}")

total_size_mb: int = (total_params * 4) / (1024 * 1024)
print("Total memory footprint required to"
          f" run the model: {total_size_mb:,.2f} MB")

Total number of parameters: 26309713
Total memory footprint required to run the model: 100.36 MB


In [12]:
def pad_or_trim_sequence(input: torch.Tensor, context_length: int):
    input = input[:, :context_length]
    output = F.pad(input, (0, context_length - input.shape[1]), "constant", 0)
    return output

In [22]:
def generate_text_simple(model,
                         idx: torch.Tensor,
                         max_new_tokens,
                         context_size):
    
    if idx.shape[-1] < context_size:
        idx = pad_or_trim_sequence(idx, context_length=context_size)

    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            output_ = model(idx_cond)
            print(f"Output shape: {output_.shape}")
        
        output_ = torch.softmax(output_, dim=-1)
        output_ = torch.argmax(output_, dim=-1)
        idx = torch.cat((idx, output_.unsqueeze(0)), dim=1)
    return idx

In [14]:
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())

In [15]:
start_context = "He was always thinking" # what the estate would be if those mortgages could but be paid off.
tokenizer = tiktoken.get_encoding("gpt2")

In [23]:
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=max_new_tokens,
    context_size=context_length
)
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output shape: torch.Size([1, 50257])
Output shape: torch.Size([1, 50257])
Output shape: torch.Size([1, 50257])
Output shape: torch.Size([1, 50257])
Output shape: torch.Size([1, 50257])
Output shape: torch.Size([1, 50257])
Output shape: torch.Size([1, 50257])
Output shape: torch.Size([1, 50257])
Output shape: torch.Size([1, 50257])
Output shape: torch.Size([1, 50257])
Output shape: torch.Size([1, 50257])
Output shape: torch.Size([1, 50257])
Output shape: torch.Size([1, 50257])
Output shape: torch.Size([1, 50257])
Output shape: torch.Size([1, 50257])
Output text:
 He was always thinking!!!!!! surgeon60female Ribbon unusually &&ades Gold TeeörTrain Bullet Casey Ryan Ryan


In [17]:

# 1.2 Generator to enabling split dataset into train and validation subsets
generator1: torch.Generator = torch.Generator().manual_seed(918)

dataset = GPTDatasetV1(raw_text, tokenizer, context_length, stride)

# 1.3 Creates a list with both subsets, 90% training, 10% evaluation
datasets: List[Subset] = torch.utils.data.random_split(
    dataset,
    [0.9, 0.1],
    generator=generator1
)

# 1.4 Assigns train and validation datasets accordingly
train_dataset: Subset = datasets[0]
validation_dataset: Subset = datasets[1]

train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=2
    )

val_dataloader = DataLoader(
        validation_dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=2
    )

In [18]:
num_epochs: int = 10

In [19]:
optimizer = optim.Adam(model.parameters(), lr=0.0005)
evaluator = nn.CrossEntropyLoss()

In [20]:
def performance(X: torch.Tensor, y: torch.Tensor):
    output = model(X)
    perf = torch.nn.functional.cross_entropy(
            output,
            y
        )
    
    return perf

In [21]:
counter = 0
model.train()
for epoch in range(num_epochs):

    for input_batch, target_batch in train_dataloader:
        optimizer.zero_grad()

        output = model(input_batch)
        loss: torch.Tensor = evaluator(output, target_batch)
        loss.backward()

        model.eval()
        perf = performance(input_batch, target_batch)
        model.train()

        if counter % 1000 == 0:
            print(f"Performance: {perf}")
        counter += 1

    print(f"End of epoch: {epoch}")
    print("Output text:\n", token_ids_to_text(token_ids, tokenizer))
    
    token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=max_new_tokens,
    context_size=context_length
)

Performance: 10.83252239227295
Performance: 10.843337059020996
End of epoch: 0
Output text:
 He was always thinking!!!!!! surgeon60female Ribbon unusually &&ades Gold TeeörTrain Bullet Casey Ryan Ryan
Performance: 10.830931663513184
End of epoch: 1
Output text:
 He was always thinking!!!!!! surgeon60female Ribbon unusually &&ades Gold TeeörTrain Bullet Casey Ryan Ryan
Performance: 10.853130340576172
Performance: 10.855406761169434
End of epoch: 2
Output text:
 He was always thinking!!!!!! surgeon60female Ribbon unusually &&ades Gold TeeörTrain Bullet Casey Ryan Ryan


KeyboardInterrupt: 