Generative Pre-trained Transformer (GPT)
https://arxiv.org/pdf/1706.03762

<h3> 1. Import Libraries </h3>

In [1]:
# Record the Python interpreter version this notebook is running on.
from platform import python_version

print(python_version())

3.9.16


In [2]:
import importlib
import subprocess
import sys

# Modules this notebook relies on, mapped to the alias each one is bound to
# in the notebook namespace (equivalent to `import <module> as <alias>`).
modules = {'tensorflow': 'tf', 'numpy': 'np', 'pandas': 'pd', 'requests': 'requests',
           'tiktoken': 'tiktoken', 'openai': 'openai', 'torch': 'torch', 'torch.nn': 'nn', 'matplotlib.pyplot': 'plt', 'torch.nn.functional': 'F'}

# Import every module under its alias, installing the missing package first when needed.
for module, name in modules.items():
    try:
        globals()[name] = importlib.import_module(module)
    except ModuleNotFoundError as error:
        print(f"{error.name} module not found. Installing {error.name}...")
        # Run pip through the kernel's own interpreter so the package is installed
        # into the environment this notebook is actually executing in.
        subprocess.check_call([sys.executable, "-m", "pip", "install", error.name])
        # Make the freshly installed package visible to the import system, then
        # bind the alias so later cells work without re-running this cell.
        importlib.invalidate_caches()
        globals()[name] = importlib.import_module(module)

<b> Check for available GPUs for Training </b> 

In [3]:
# Build one torch.cuda.device handle per CUDA device that torch reports;
# the list is empty when no GPU is visible.
available_gpus = list(map(torch.cuda.device, range(torch.cuda.device_count())))
available_gpus

[]

<h3> 2. Import Data </h3> 

In [4]:
import urllib.request

def load_text_from_url(url):
    """Fetch ``url`` and return the response body decoded as UTF-8 text.

    Args:
        url (str): Address to open with ``urllib.request.urlopen``.

    Returns:
        str: The full response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the underlying connection/file handle even
    # when reading or decoding fails.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')

# Raw text of the tinyshakespeare dataset hosted in the karpathy/char-rnn repository.
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
text = load_text_from_url(url)

<h3> 3. Basic Data Exploration </h3> 

<h3> 4. Create Basic Tokenization Functions and Encode Data </h3> 

<b> Encoding 1: Character Level </b> 

In [5]:
# Create a mapping from characters to integers
#stoi = { ch:i for i,ch in enumerate(chars) } # This creates a lookup table for converting a character to an int
#itos = { i:ch for i,ch in enumerate(chars) } # This creates a lookup table for converting an int to a char
#encode = lambda s: [stoi[c] for c in s]
#decode = lambda l: ''.join(itos[i] for i in l)
#print(encode("Hello World"))
#print(decode(encode("Hello World")))

<b> Encoding 2: Subword OpenAI GPT-2 Dictionary 'gpt2' (the GPT-4 'cl100k_base' dictionary is left commented out)</b>

In [6]:
# Load OpenAI's GPT-2 subword tokenizer through tiktoken.
gpt2enc = tiktoken.get_encoding('gpt2')
# gpt4enc = tiktoken.get_encoding('cl100k_base')

# The tokenizer's vocabulary size fixes the width of the model's embedding and output layers.
vocab_size = gpt2enc.n_vocab
print(f"The length of the gpt2 encoding dictionary is {vocab_size}")

# Round-trip a short sample: text -> token ids -> text.
sample_token_ids = gpt2enc.encode("Hello World")
print(sample_token_ids)
print(gpt2enc.decode(sample_token_ids))

The length of the gpt2 encoding dictionary is 50257
[15496, 2159]
Hello World


In [7]:
# Sentence piece from google

<h3> 5. Encode Dataset </h3>

In [8]:
# Tokenize the whole corpus once and keep the token ids as a 1-D int64 tensor.
data = torch.tensor(gpt2enc.encode(text), dtype=torch.long)
print(data.shape, data.dtype)
# Peek at the first 100 token ids.
print(data[:100])

torch.Size([338025]) torch.int64
tensor([ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252,    11,
         3285,   502,  2740,    13,   198,   198,  3237,    25,   198,  5248,
          461,    11,  2740,    13,   198,   198,  5962, 22307,    25,   198,
         1639,   389,   477, 12939,  2138,   284,  4656,   621,   284,  1145,
          680,    30,   198,   198,  3237,    25,   198,  4965,  5634,    13,
        12939,    13,   198,   198,  5962, 22307,    25,   198,  5962,    11,
          345,   760,   327,  1872,   385,  1526, 28599,   318,  4039,  4472,
          284,   262,   661,    13,   198,   198,  3237,    25,   198,  1135,
          760,   470,    11,   356,   760,   470,    13,   198,   198,  5962,
        22307,    25,   198,  5756,   514,  1494,   683,    11,   290,   356])


<h3> 6. Create Training and Validation Datasets </h3> 

In [9]:
def train_validate_split(dataset, cutoff):
    """Split a sequence into a leading training part and a trailing validation part.

    Args:
        dataset: Any sliceable sequence with a length (tensor, list, str, ...).
        cutoff (float): Fraction of the sequence assigned to the training part;
            the boundary index is ``int(cutoff * len(dataset))``.

    Returns:
        tuple: ``(train_data, val_data)`` where ``train_data`` is everything
        before the boundary and ``val_data`` is everything from it onwards.
    """
    boundary = int(cutoff * len(dataset))
    return dataset[:boundary], dataset[boundary:]

In [10]:
# The first 90% of the token stream is used for training; the remaining tail is held out for validation.
cutoff = 0.90
train_data, val_data = train_validate_split(data,cutoff)

<h3> 7. Define Basic Bigram Transformer Model </h3> 

A Bigram transformer is a type of language model that uses the concept of bigrams to predict the next word in a sequence of words. Bigrams are pairs of adjacent words in a sentence or text. For example, in the sentence "The quick brown fox jumps over the lazy dog," the bigrams are "The quick", "quick brown", "brown fox", "fox jumps", "jumps over", "over the", and "the lazy".

A Bigram transformer learns the probability distribution of bigrams from a given text corpus, and uses this knowledge to predict the most likely next word in a sentence based on the previous word. It is a simple and effective approach for language modeling that can be used for a variety of natural language processing tasks, such as machine translation, speech recognition, and text generation.

In [11]:
# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 8 #256 # what is the maximum context length for predictions?
max_iters = 3000 # upper bound on the number of optimizer steps in train_model
eval_interval = 50 # estimate train/val loss every this many training steps
learning_rate = 3e-4 # learning rate handed to the AdamW optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu' # run on the GPU when one is available
eval_iters = 200 # number of random batches averaged per split in estimate_loss
n_embd = 384 # embedding dimension
n_head = 6 # attention heads per transformer block
n_layer = 6 # number of transformer blocks stacked in the model
dropout = 0.2 # probability used by every nn.Dropout layer
patience = 3 # consecutive evaluations without sufficient improvement before stopping early
plateau = 0.005 # minimum drop in validation loss that counts as an improvement

# Fix the torch random seed so batch sampling and weight initialisation are repeatable.
torch.manual_seed(1337)

<torch._C.Generator at 0x1f12b4faf90>

<b> Data Batching Function </b> 

This function is used for generating a mini-batch of data from the specified dataset (training or validation) to be used during the training or evaluation process of a machine learning model. It randomly selects batch_size starting indices, then creates input sequences (x) and their corresponding target sequences (y) of length block_size. The input and target sequences are shifted by one position, making the model predict the next element in the sequence.

In [12]:
def get_batch(split):
    """
    Sample a random mini-batch of (input, target) token windows.

    Args:
    split (str): 'train' draws from the training tensor; any other value draws from the validation tensor.

    Returns:
    x (torch.Tensor): Input windows of shape (batch_size, block_size), moved to `device`.
    y (torch.Tensor): Target windows of the same shape; each one is its input window shifted one token to the right.

    Global Variables:
    train_data (torch.Tensor): Token ids of the training split.
    val_data (torch.Tensor): Token ids of the validation split.
    batch_size (int): Number of windows per batch.
    block_size (int): Length of every window.
    device (str): Device the batch is moved to.
    """
    source = val_data if split != 'train' else train_data
    # Random window starts; the upper bound leaves room for the one-token-shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[start:start + block_size] for start in starts])
    targets = torch.stack([source[start + 1:start + block_size + 1] for start in starts])
    return inputs.to(device), targets.to(device)

<b> Loss Estimation Function </b> 

<h4>Function Name:</h4> 
<p><code>estimate_loss()</code></p>
<h4>Input:</h4> 
<p>None</p>
<h4>Output:</h4> 
<p>A Python dictionary containing the average loss over the training and validation datasets.</p>
<h4>Functionality:</h4> 
<p>This function estimates the average loss over the training and validation datasets for a PyTorch model.</p>
<h4>Steps:</h4> 
<ol>
    <li>Initializes an empty Python dictionary <code>out</code> to store the average loss for each dataset.</li>
    <li>Sets the model to evaluation mode using <code>model.eval()</code>.</li>
    <li>Loops over the two splits of the dataset: training and validation.</li>
    <li>Initializes a PyTorch tensor <code>losses</code> with shape <code>(eval_iters,)</code> to store the loss for each evaluation iteration.</li>
    <li>Loops over <code>eval_iters</code> number of iterations and retrieves a batch of data for the given split using <code>get_batch(split)</code>.</li>
    <li>Passes the input data <code>X</code> and target <code>Y</code> through the PyTorch model <code>model</code> to get the logits and the loss for the batch.</li>
    <li>Stores the loss value as a scalar in the <code>losses</code> tensor at the index <code>k</code>.</li>
    <li>Computes the mean of the <code>losses</code> tensor and stores the result in the <code>out</code> dictionary with the key being the split.</li>
    <li>Sets the model back to training mode using <code>model.train()</code>.</li>
    <li>Returns the <code>out</code> dictionary containing the average loss for each dataset.</li>
</ol>
<p><em>Note: It is assumed that the variables <code>model</code>, <code>eval_iters</code>, and <code>get_batch()</code> are defined and accessible within the scope of this function.</em></p>

In [13]:
@torch.no_grad() # gradients are not needed here: "backward" is never called on these losses
def estimate_loss():
    """Average the model's loss over `eval_iters` random batches of each split.

    Uses the module-level `model`, `eval_iters` and `get_batch`. The model is
    put into evaluation mode while measuring and switched to training mode
    before returning.

    Returns:
        dict: {'train': mean loss, 'val': mean loss}, each a 0-dim tensor.
    """
    def _mean_split_loss(split_name):
        # One loss value per sampled batch, averaged at the end.
        batch_losses = torch.zeros(eval_iters)
        for batch_no in range(eval_iters):
            inputs, targets = get_batch(split_name)
            _, batch_loss = model(inputs, targets)
            batch_losses[batch_no] = batch_loss.item()
        return batch_losses.mean()

    model.eval()
    averages = {split_name: _mean_split_loss(split_name) for split_name in ['train', 'val']}
    model.train()
    return averages


<h4><code>class Head(nn.Module):</code></h4>
<p>
In natural language processing, self-attention is a technique used in deep learning models to help the model understand the context of a sentence or a document. It works by assigning weights to each word in the input text, based on the relevance of that word to the other words in the text. These weights help the model to focus on the most important words and ignore the less important ones.

A head of self-attention is a single part of the self-attention mechanism. Think of it like a small flashlight that helps the model to focus on a specific part of the input text. Each head of self-attention works independently to assign weights to the words in the input text, and then combines the weighted representation of the input text to create a more accurate understanding of the context.

Just like how you might use multiple flashlights to illuminate different parts of a room, deep learning models can use multiple heads of self-attention to better understand different aspects of the input text. By combining the information from multiple heads of self-attention, the model can create a more complete and accurate representation of the input text, which helps it perform better on natural language processing tasks like language translation and text classification.</p>
<h4><code>Parameters:</code></h4>
<ul>
<li><code>head_size (int)</code>: The size of the subspace for key, query, and value tensors.</li>
</ul>
<h4><code>Attributes:</code></h4>
<ul>
<li><code>key (nn.Linear)</code>: Linear layer for projecting the input tensor into the key subspace.</li>
<li><code>query (nn.Linear)</code>: Linear layer for projecting the input tensor into the query subspace.</li>
<li><code>value (nn.Linear)</code>: Linear layer for projecting the input tensor into the value subspace.</li>
<li><code>tril (torch.Tensor)</code>: Lower-triangular matrix of ones, with the same size as the sequence length, used to mask out attention scores corresponding to positions that have not been seen yet during training.</li>
<li><code>dropout (nn.Dropout)</code>: Dropout layer to prevent overfitting.</li>
</ul>
<h4><code>Inputs:</code></h4>
<ul>
<li><code>x (torch.Tensor)</code>: Input tensor of shape `(B, T, C)`, where `B` is the batch size, `T` is the sequence length, and `C` is the hidden size of the input tensor.</li>
</ul>
<h4><code>Outputs:</code></h4>
<ul>
<li><code>out (torch.Tensor)</code>: Output tensor of shape `(B, T, head_size)`: same batch size and sequence length as the input, with the last dimension equal to the head size.</li>
</ul>
<h4><code>Methods:</code></h4>
<ul>
<li><code>forward(x: torch.Tensor) -&gt; torch.Tensor:</code> Computes the self-attention of the input tensor `x`, and returns the output tensor.</li>
</ul>

In [14]:
# Model copied from https://github.com/karpathy/nanoGPT
class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Args:
        head_size (int): Width of the key, query and value projections.

    Reads the module-level ``n_embd`` (input width), ``block_size`` (size of
    the causal mask) and ``dropout`` (attention dropout probability).
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask: position t may only attend to positions <= t.
        # Registered as a buffer so it moves with the module but is not trained.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, n_embd) and return a (B, T, head_size) tensor."""
        T = x.shape[1]
        k = self.key(x)   # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)
        # compute attention scores ("affinities"), scaled by 1/sqrt(d_k) where d_k
        # is the key width (head_size). Scaling by the embedding width instead
        # shrinks the scores too much and flattens the softmax.
        # NOTE(review): checkpoints trained with the old n_embd-based scale will
        # produce different outputs under this scale; retrain them.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # hide future positions so they receive zero weight after the softmax
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B, T, head_size)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

<div>
<h3><code>class MultiHeadAttention(nn.Module):</code></h3>
<p>The MultiHeadAttention module is an implementation of the multi-head attention mechanism used in transformer models. The idea behind multi-head attention is to split the input tensor into multiple "heads", and apply the self-attention mechanism to each head independently. This allows the model to attend to different parts of the input tensor with different sets of weights, which can help the model learn more diverse and meaningful representations.

The MultiHeadAttention module takes an input tensor x of shape (B, T, C), where B is the batch size, T is the sequence length, and C is the hidden size of the input tensor. The num_heads parameter specifies the number of attention heads, and the head_size parameter specifies the size of each attention head.

In the __init__ method, the MultiHeadAttention module creates a list of num_heads Head modules, each with a size of head_size. Each Head module computes the attention scores and corresponding weighted values for a single attention head. The MultiHeadAttention module then applies a linear projection to the concatenated output of the attention heads to transform it back into the original hidden size C. Finally, a dropout layer is applied to prevent overfitting.

In the forward method, the MultiHeadAttention module passes the input tensor x through each Head module, concatenates the outputs along the last dimension, and applies the linear projection and dropout layers to the concatenated output. The resulting tensor has the same shape as the input tensor.

Overall, the MultiHeadAttention module allows the transformer model to attend to multiple parts of the input tensor at once, and learn more complex relationships between different parts of the input. This can lead to better performance on tasks that require modeling complex relationships between different parts of the input, such as natural language understanding or image processing.</p>
<h4><code>Parameters:</code></h4>
<ul>
<li><code>num_heads (int)</code>: The number of heads in the multi-head attention.</li>
<li><code>head_size (int)</code>: The size of each head in the multi-head attention.</li>
</ul>
<h4><code>Attributes:</code></h4>
<ul>
<li><code>heads (nn.ModuleList)</code>: Module list containing the individual heads of self-attention.</li>
<li><code>proj (nn.Linear)</code>: Linear layer for projecting the concatenated output of the individual attention heads into the original hidden size.</li>
<li><code>dropout (nn.Dropout)</code>: Dropout layer to prevent overfitting.</li>
</ul>
<h4><code>Inputs:</code></h4>
<ul>
<li><code>x (torch.Tensor)</code>: Input tensor of shape `(B, T, C)`, where `B` is the batch size, `T` is the sequence length, and `C` is the hidden size of the input tensor.</li>
</ul>
<h4><code>Outputs:</code></h4>
<ul>
<li><code>out (torch.Tensor)</code>: Output tensor of the same shape as the input tensor.</li>
</ul>
<h4><code>Methods:</code></h4>
<ul>
<li><code>forward(x: torch.Tensor) -&gt; torch.Tensor:</code> Computes the multi-head self-attention of the input tensor `x`, and returns the output tensor.</li>
</ul>
</div>

In [15]:
# Model copied from https://github.com/karpathy/nanoGPT

class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention run in parallel.

    Args:
        num_heads (int): Number of ``Head`` modules to run side by side.
        head_size (int): Output width of each head.

    Reads the module-level ``n_embd`` (output width of the projection) and
    ``dropout`` (dropout probability applied after the projection).
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated heads are num_heads * head_size wide. That equals n_embd
        # only when n_embd is divisible by num_heads, so size the projection from
        # the heads themselves instead of assuming it.
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on ``x``, concatenate on the last dim, project back to n_embd."""
        out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, num_heads * head_size)
        out = self.dropout(self.proj(out)) # (B, T, n_embd)
        return out

<div>
<h3><code>class FeedFoward(nn.Module):</code></h3>
<p>The FeedFoward module is a two-layer feedforward neural network used in transformer models. It consists of a fully connected layer followed by a ReLU activation function, another fully connected layer, and a dropout layer. The module takes an input tensor x of shape (B, T, C), where B is the batch size, T is the sequence length, and C is the hidden size of the input tensor.

In the __init__ method, the FeedFoward module creates a sequential neural network with two fully connected layers. The first layer has an output size of 4 * n_embd and uses the ReLU activation function. The second layer has an output size of n_embd. Finally, a dropout layer is applied to prevent overfitting.

In the forward method, the FeedFoward module passes the input tensor x through the sequential neural network to compute the output tensor. The resulting tensor has the same shape as the input tensor.

Overall, the FeedFoward module is used to add non-linearity and increase the expressiveness of the transformer model. The two-layer feedforward neural network can learn complex representations of the input tensor, which can be helpful for tasks that require modeling complex relationships between different parts of the input, such as natural language understanding or image processing.</p>
<h4><code>Parameters:</code></h4>
<ul>
<li><code>n_embd (int)</code>: The size of the input and output tensors of the feedforward neural network.</li>
</ul>
<h4><code>Attributes:</code></h4>
<ul>
<li><code>net (nn.Sequential)</code>: Sequential neural network containing the fully connected layers and dropout layer of the feedforward neural network.</li>
</ul>
<h4><code>Inputs:</code></h4>
<ul>
<li><code>x (torch.Tensor)</code>: Input tensor of shape `(B, T, C)`, where `B` is the batch size, `T` is the sequence length, and `C` is the hidden size of the input tensor.</li>
</ul>
<h4><code>Outputs:</code></h4>
<ul>
<li><code>out (torch.Tensor)</code>: Output tensor of the same shape as the input tensor.</li>
</ul>
<h4><code>Methods:</code></h4>
<ul>
<li><code>forward(x: torch.Tensor) -&gt; torch.Tensor:</code> Computes the feedforward neural network of the input tensor `x`, and returns the output tensor.</li>
</ul>
<p>Overall, the `FeedFoward` module is used to add non-linearity and increase the expressiveness of the transformer model. The two-layer feedforward neural network can learn complex representations of the input tensor, which can be helpful for tasks that require modeling complex relationships between different parts of the input, such as natural language understanding or image processing.</p>
</div>

In [16]:
class FeedFoward(nn.Module):
    """Position-wise MLP: Linear -> ReLU -> Linear -> Dropout.

    Args:
        n_embd (int): Width of the input and output; the hidden layer is four times wider.

    Reads the module-level ``dropout`` probability for the final dropout layer.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        # Layers are created in application order: expand, non-linearity, contract, regularise.
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``; the last dimension keeps its size."""
        return self.net(x)

<div>
<h3><code>class Block(nn.Module):</code></h3>
<p>The Block module is a building block of the transformer model that consists of a multi-head self-attention layer, a feedforward neural network layer, and two layer normalization layers. It takes an input tensor x of shape (B, T, C), where B is the batch size, T is the sequence length, and C is the hidden size of the input tensor.

In the __init__ method, the Block module initializes a MultiHeadAttention layer and a FeedFoward layer with the specified embedding dimension n_embd and number of attention heads n_head. It also initializes two LayerNorm layers, which normalize the input to the self-attention and feedforward layers, respectively (the "pre-norm" arrangement).

In the forward method, the Block module normalizes the input tensor with the first layer normalization, applies the self-attention layer to the normalized tensor, and adds the result to the original input tensor through a residual connection. It then normalizes that sum with the second layer normalization, applies the feedforward layer to the normalized tensor, adds the result to the sum through a second residual connection, and returns the final tensor.

Overall, the Block module is a key building block of the transformer model that allows the model to process sequences of variable length and capture complex relationships between different parts of the input. By stacking multiple Block modules together, the transformer model can learn complex representations of the input sequence that can be used for a wide range of natural language processing tasks, such as machine translation, text classification, and text generation.</p>
<h4><code>Parameters:</code></h4>
<ul>
<li><code>n_embd (int)</code>: The embedding dimension of the input tensor.</li>
<li><code>n_head (int)</code>: The number of attention heads to use in the multi-head self-attention layer.</li>
</ul>
<h4><code>Attributes:</code></h4>
<ul>
<li><code>sa (MultiHeadAttention)</code>: Multi-head self-attention layer that takes the input tensor as input and returns the output tensor after applying the self-attention mechanism.</li>
<li><code>ffwd (FeedFoward)</code>: Feedforward neural network layer that takes the input tensor as input and returns the output tensor after applying a two-layer feedforward neural network.</li>
<li><code>ln1 (nn.LayerNorm)</code>: First layer normalization layer, applied to the block input before it is passed to the self-attention layer.</li>
<li><code>ln2 (nn.LayerNorm)</code>: Second layer normalization layer, applied to the result of the first residual connection before it is passed to the feedforward neural network layer.</li>
</ul>
<h4><code>Inputs:</code></h4>
<ul>
<li><code>x (torch.Tensor)</code>: Input tensor of shape `(B, T, C)`, where `B` is the batch size, `T` is the sequence length, and `C` is the hidden size of the input tensor.</li>
</ul>
<h4><code>Outputs:</code></h4>
<ul>
<li><code>x (torch.Tensor)</code>: Output tensor of the same shape as the input tensor.</li>
</ul>
<h4><code>Methods:</code></h4>
<ul>
<li><code>forward(x: torch.Tensor) -&gt; torch.Tensor:</code> Computes the output tensor by normalizing the input tensor using layer normalization, passing the normalized tensor through the multi-head self-attention layer, adding the resulting tensor to the input tensor using a residual connection, normalizing that sum using layer normalization, passing the normalized tensor through the feedforward neural network layer, adding the resulting tensor to the sum using a second residual connection, and returning the final tensor.</li>
</ul>
<p>Overall, the `Block` module is a key building block of the transformer model that allows the model to process sequences of variable length and capture complex relationships between different parts of the input. By stacking multiple `Block` modules together, the transformer model can learn complex representations of the input sequence that can be used for a wide range of natural language processing tasks, such as machine translation, text classification, and text generation.</p>
</div>

In [17]:
class Block(nn.Module):
    """Transformer block: self-attention followed by a feed-forward network.

    Each sub-layer receives a layer-normalised copy of its input, and its
    output is added back to that input through a residual connection.

    Args:
        n_embd (int): Embedding dimension.
        n_head (int): Number of attention heads; each head is ``n_embd // n_head`` wide.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return a tensor of the same shape as ``x`` after both residual sub-layers."""
        after_attention = x + self.sa(self.ln1(x))
        return after_attention + self.ffwd(self.ln2(after_attention))

# super simple bigram model
class BigramLanguageModel(nn.Module):
    """Token-level language model: embeddings, a stack of transformer blocks, and a linear head.

    Reads the module-level ``vocab_size``, ``n_embd``, ``block_size``,
    ``n_head`` and ``n_layer`` settings when it is constructed.
    """

    def __init__(self):
        super().__init__()
        # learned embedding per token id and per position inside the context window
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            idx (torch.Tensor): (B, T) integer token ids, with T <= block_size.
            targets (torch.Tensor, optional): (B, T) integer ids of the tokens to predict.

        Returns:
            tuple: ``(logits, loss)``. Without targets, logits are (B, T, vocab_size)
            and loss is None. With targets, logits are flattened to
            (B*T, vocab_size) and loss is a scalar tensor.
        """
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # Build the position indices on the same device as the input so the model
        # works wherever it has been moved, independent of the global `device`.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, classes) logits and (N,) targets
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Sampling runs in evaluation mode (dropout off) and without autograd, so
        the predictions are not perturbed by dropout noise and no gradient graph
        accumulates across steps. The previous training/eval mode is restored
        before returning.

        Args:
            idx (torch.Tensor): (B, T) integer token ids used as the starting context.
            max_new_tokens (int): Number of tokens to append.

        Returns:
            torch.Tensor: (B, T + max_new_tokens) token ids.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                # idx is (B, T) array of indices in the current context
                for _ in range(max_new_tokens):
                    # crop idx to the last block_size tokens
                    idx_cond = idx[:, -block_size:]
                    # get the predictions
                    logits, loss = self(idx_cond)
                    # focus only on the last time step
                    logits = logits[:, -1, :] # becomes (B, C)
                    # apply softmax to get probabilities
                    probs = F.softmax(logits, dim=-1) # (B, C)
                    # sample from the distribution
                    idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                    # append sampled index to the running sequence
                    idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        finally:
            self.train(was_training)
        return idx

In [18]:
# Build the model and move its parameters to the selected device.
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

49.291345 M parameters


<h3> 8. Train Bigram Model </h3>

In [19]:
import time
import datetime
import pytz
import os
import matplotlib.pyplot as plt

def train_model(filename, model, max_iters, eval_interval, plateau, patience, learning_rate):
    """Train ``model`` with AdamW, or reload previously saved weights from ``filename``.

    If ``filename`` exists the user is asked whether to reuse it; answering 'y'
    loads the saved state dict and returns immediately. Otherwise the model is
    trained on batches from the module-level ``get_batch``, evaluated with the
    module-level ``estimate_loss`` every ``eval_interval`` steps, stopped early
    when the validation loss plateaus, plotted, and saved to ``filename``.

    Args:
        filename (str): Path of the checkpoint to load from / save to.
        model (nn.Module): Model called as ``model(x, y)`` returning ``(logits, loss)``.
        max_iters (int): Maximum number of optimizer steps.
        eval_interval (int): Number of steps between loss evaluations.
        plateau (float): Minimum drop in validation loss (relative to the best
            value so far) that counts as an improvement.
        patience (int): Number of consecutive evaluations without improvement
            after which training stops.
        learning_rate (float): Learning rate for AdamW.

    Returns:
        nn.Module: The same ``model`` object, trained or loaded.
    """
    # Check if the trained model already exists
    if os.path.exists(filename):
        use_existing_model = input('Trained model already exists. Do you want to use it? (y/n) ')
        if use_existing_model.lower() == 'y':
            # Map the stored tensors onto the model's current device so a checkpoint
            # written on a GPU machine still loads on a CPU-only one.
            model_device = next(model.parameters()).device
            model.load_state_dict(torch.load(filename, map_location=model_device))
            return model

    # Train the model
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Create a list to store the losses
    loss_history = []

    best_val_loss = float('inf')
    num_epochs_no_improvement = 0

    start_time = time.time()
    pacific = pytz.timezone('US/Pacific')
    start_datetime = datetime.datetime.now(pacific)

    for step in range(max_iters):

        # sample a batch of data
        xb, yb = get_batch('train')

        # evaluate the loss
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        # every once in a while evaluate the loss on train and val sets
        if step % eval_interval == 0:
            losses = estimate_loss()
            # Plain floats keep the history, the comparisons and the plot free of tensors.
            train_loss = float(losses['train'])
            val_loss = float(losses['val'])
            elapsed_time = time.time() - start_time
            current_datetime = datetime.datetime.now(pacific)
            time_str = current_datetime.strftime('%Y-%m-%d %H:%M:%S %Z%z')

            # Store the loss every eval_interval iterations
            loss_history.append({'train': train_loss, 'val': val_loss, 'step': step,
                                 'timestamp': current_datetime, 'elapsed_time': elapsed_time})

            # Check for plateau: the validation loss must beat the best value by at least `plateau`
            if best_val_loss - val_loss < plateau:
                num_epochs_no_improvement += 1
                if num_epochs_no_improvement >= patience:
                    print(f"Validation loss plateaued for {patience} epochs, stopping early...")
                    break
            else:
                best_val_loss = val_loss
                num_epochs_no_improvement = 0

            if step > 2 * eval_interval:
                # Extrapolate the average time per step so far to the remaining steps.
                time_elapsed = current_datetime - start_datetime
                time_per_iter = time_elapsed / step
                est_remaining_iters = max_iters - step
                est_remaining_time = est_remaining_iters * time_per_iter
                est_completion_time = (current_datetime + est_remaining_time).strftime('%Y-%m-%d %H:%M:%S %Z%z')
                print(f"step {step}: train loss {train_loss:.4f}, val loss {val_loss:.4f}, timestamp {time_str}, elapsed time {elapsed_time:.4f}s, est. completion {est_completion_time}")
            else:
                print(f"step {step}: train loss {train_loss:.4f}, val loss {val_loss:.4f}, timestamp {time_str}, elapsed time {elapsed_time:.4f}s")

        if step == max_iters-1:
            # Measure now: the value from the last evaluation step can be many steps old.
            elapsed_time = time.time() - start_time
            print(f"Training completed: elapsed time {elapsed_time:.4f}s")

    # Plot the loss history
    train_losses = [entry['train'] for entry in loss_history]
    val_losses = [entry['val'] for entry in loss_history]
    steps = [entry['step'] for entry in loss_history]
    plt.plot(steps, train_losses, label='Train Loss')
    plt.plot(steps, val_losses, label='Validation Loss')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Train and Validation Loss over Time')
    plt.legend()
    plt.show()

    # Save the trained model
    torch.save(model.state_dict(), filename)
    return model

In [20]:
# Checkpoint path: train_model offers to reload it when the file already exists,
# and otherwise saves the trained weights there.
#filename='shakespeare.pt'
filename='shakespeare2.pt'
trained_model = train_model(filename, model, max_iters, eval_interval, plateau, patience, learning_rate)
trained_model

KeyboardInterrupt: 

In [None]:
# generate from the model, starting from a single token with id 0
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_tokens = trained_model.generate(context, max_new_tokens=1000)[0].tolist()
generated_text = gpt2enc.decode(generated_tokens)
print(generated_text)

# Output the generated text to a file. The encoding is explicit because decoded
# tokens can contain non-ASCII characters, which would fail to write on platforms
# whose default text encoding is not UTF-8.
with open('generated_text.txt', 'w', encoding='utf-8') as f:
    f.write(generated_text)