In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import re
from tokenizer_module import Tokenizer  # Importing Tokenizer module if it's defined elsewhere

In [None]:
# Defining the GPT model
class GPT(nn.Module):
    """Decoder-only (GPT-style) transformer language model.

    Token ids are embedded, summed with learned positional embeddings, run
    through a stack of causally masked self-attention layers, and projected
    to per-position logits over the vocabulary.

    Args:
        vocab_size: Size of the token embedding table and of the output layer.
        embedding_dim: Width of the token/positional embeddings and of the
            transformer layers.
        num_heads: Number of attention heads per layer.
        ff_dim: Hidden width of each layer's feed-forward sub-block.
        num_layers: Number of stacked transformer layers.
        max_sequence_len: Size of the positional embedding table, i.e. the
            longest sequence ``forward`` accepts.
        dropout_rate: Dropout probability used inside the transformer layers.
    """

    def __init__(self, vocab_size, embedding_dim, num_heads, ff_dim, num_layers, max_sequence_len, dropout_rate):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = nn.Embedding(max_sequence_len, embedding_dim)
        # A GPT block is self-attention + feed-forward with no cross-attention.
        # nn.TransformerDecoder requires a separate `memory` input for its
        # cross-attention, so the matching PyTorch building block is the
        # encoder layer combined with a causal mask (applied in forward()).
        # batch_first=True because forward() works on (batch, seq, dim) tensors.
        self.transformer_layers = nn.TransformerEncoderLayer(
            embedding_dim, num_heads, ff_dim, dropout_rate, batch_first=True
        )
        # Stack `num_layers` copies of the layer above
        self.transformer = nn.TransformerEncoder(self.transformer_layers, num_layers)
        # Fully connected layer producing logits for each entry of the vocabulary
        self.fc = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        """Compute next-token logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of shape (batch, seq_len) holding token ids.

        Returns:
            Tensor of shape (batch, seq_len, vocab_size) with the logits for
            every position.

        Raises:
            ValueError: If ``seq_len`` exceeds the positional embedding table.
        """
        _, seq_len = x.size()
        max_len = self.positional_encoding.num_embeddings
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_sequence_len {max_len}"
            )
        # Positional embeddings have shape (seq_len, dim) and broadcast over the batch
        positions = torch.arange(seq_len, device=x.device)
        x = self.embedding(x) + self.positional_encoding(positions)
        # Causal mask: -inf strictly above the diagonal stops position i from
        # attending to any position j > i
        causal_mask = torch.triu(
            torch.full((seq_len, seq_len), float("-inf"), dtype=x.dtype, device=x.device),
            diagonal=1,
        )
        x = self.transformer(x, mask=causal_mask)
        # Project the transformer output to the vocab size
        return self.fc(x)

In [None]:
# Setting the hyperparameters
seed = 42                # fixed seed so weight initialisation is repeatable across runs
vocab_size = 10000       # size of the embedding table and of the output layer
embedding_dim = 256      # width of the embeddings and transformer layers
num_heads = 8            # attention heads per layer
ff_dim = 512             # hidden width of the feed-forward sub-block
num_layers = 6           # number of stacked transformer layers
max_sequence_len = 100   # size of the positional embedding table
dropout_rate = 0.1       # dropout probability inside the transformer layers
num_epochs = 10          # passes over the training data

batch_size = 32          # intended mini-batch size (not read anywhere else in this cell)

# Seed PyTorch's RNG before any parameters are created, so that a
# Restart & Run All builds the same initial model every time
torch.manual_seed(seed)

# Creating the GPT model instance
model = GPT(vocab_size, embedding_dim, num_heads, ff_dim, num_layers, max_sequence_len, dropout_rate)

# Defining the optimizer and loss function
# AdamW is the Adam variant that applies weight decay decoupled from the gradient update
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
# Cross-entropy loss between the predicted logits and the target token ids
criterion = nn.CrossEntropyLoss()

In [None]:
# Read the raw corpus from disk
with open('wiz_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Normalise in one pass: lowercase, then drop every character that is
# neither a word character nor whitespace (i.e. punctuation)
text = re.sub(r'[^\w\s]', '', text.lower())

# Build the tokenizer from the cleaned text and encode the corpus with it.
# NOTE(review): Tokenizer lives in tokenizer_module and is not shown here;
# torch.tensor() below presumably needs tokenize_sentences to return
# equal-length lists of integer ids — confirm against that module.
tokenizer = Tokenizer(text)
input_sequences = tokenizer.tokenize_sentences(text)

# Wrap the encoded sequences in a PyTorch tensor
input_sequences_tensor = torch.tensor(input_sequences)

# Next-token targets: every column of the inputs except the first.
# NOTE(review): the targets therefore have one column fewer than
# input_sequences_tensor; whatever consumes both must account for that.
target_sequences_tensor = input_sequences_tensor[:, 1:]


In [None]:
# Training loop
# Position t must predict token t+1, so the model is fed every token except
# the last one; that makes the logits line up one-to-one with
# target_sequences_tensor (the inputs shifted left by one position).
train_inputs = input_sequences_tensor[:, :-1]
train_targets = target_sequences_tensor
num_sequences = train_inputs.size(0)
if num_sequences == 0:
    raise ValueError("no training sequences available")

# Make sure dropout is active while training
model.train()

for epoch in range(num_epochs):
    running_loss = 0.0
    # Iterate in mini-batches instead of pushing the whole corpus through the
    # model in a single forward pass
    for start in range(0, num_sequences, batch_size):
        batch_inputs = train_inputs[start:start + batch_size]
        batch_targets = train_targets[start:start + batch_size]

        # Clear the gradients from the previous iteration
        optimizer.zero_grad()
        # Forward pass: compute the predicted logits
        logits = model(batch_inputs)
        # reshape() rather than view(): the target slice is non-contiguous,
        # and view() raises on non-contiguous tensors
        loss = criterion(logits.reshape(-1, logits.size(-1)), batch_targets.reshape(-1))
        # Backward pass: calculate gradients
        loss.backward()
        # Update model parameters based on the gradients
        optimizer.step()

        # Weight by the batch size so the epoch figure is a per-sequence mean
        running_loss += loss.item() * batch_inputs.size(0)

    print(f"Epoch {epoch+1}, Loss: {running_loss / num_sequences}")
