## Setting up GPT:

In [2]:
import torch
import torch.nn as nn

In [3]:
class MultiHeadAttention(nn.Module):
    def __init__(self,d_in,d_out,
                 context_length,dropout,
                 num_heads, qkv_bias=False):
        
        super().__init__()
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.W_query = nn.Linear(d_in,d_out,bias=qkv_bias)
        self.W_key = nn.Linear(d_in,d_out,bias=qkv_bias)
        self.W_value = nn.Linear(d_in,d_out,bias=qkv_bias)
        self.out_proj = nn.Linear(d_out,d_out)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length,context_length),
                       diagonal=1)
        )


    def forward(self,x):
        b, num_tokens , d_in = x.shape
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # Splitting the matrix by adding num_heads
        keys = keys.view(b,num_tokens,self.num_heads,self.head_dim)
        queries = queries.view(b,num_tokens,self.num_heads,self.head_dim)
        values = values.view(b,num_tokens,self.num_heads,self.head_dim)


        # Converting the shape
        # b,num_tokens,num_heads,head_dim = b,num_heads,num_tokens,head_dim
        # crucial for aligning the keys and values in multiple heads
        keys = keys.transpose(1,2) 
        queries = queries.transpose(1,2)
        values = values.transpose(1,2)

        attention_scores = queries @ keys.transpose(2,3) # dot product for each head
        mask_bool = self.mask.bool()[:num_tokens,:num_tokens]
        attention_scores.masked_fill_(mask_bool, -torch.inf)

        attention_weights = torch.softmax(
            attention_scores / keys.shape[-1]**0.5 , dim = -1
        )
        attention_weights = self.dropout(attention_weights)

        # b , num_token , n_heads , head_dim
        context_vectors = (attention_weights @ values).transpose(1,2) 

        # combines the heads 
        context_vectors = context_vectors.contiguous().view(
            b,num_tokens,self.d_out
        )
        context_vectors = self.out_proj(context_vectors)
        return context_vectors
    

class TransformerBlock(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in = cfg["emb_dim"],
            d_out= cfg["emb_dim"],
            context_length= cfg["context_length"],
            num_heads= cfg["n_heads"],
            dropout= cfg["drop_rate"],
            qkv_bias= cfg["qkv_bias"]
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])   

    def forward(self,x):
        residual = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_shortcut(x)
        x = x + residual

        residual = x
        x = self.norm2(x)
        x = self.ff(x)  
        x = self.drop_shortcut(x)
        x = x + residual
        return x
    

class LayerNorm(nn.Module):
    def __init__(self,emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self,x):
        mean = x.mean(dim=-1,keepdim=True)
        var = x.var(dim=-1,keepdim=True,unbiased=False)
        norm_x = (x-mean)/torch.sqrt(var+self.eps)
        return self.scale * norm_x + self.shift
    

class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self,x):
        return 0.5*x*(1+torch.tanh(
            torch.sqrt(torch.tensor(2.0/torch.pi))*
            (x+0.044715+torch.pow(x,3))
        ))
    

class FeedForward(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"],cfg["emb_dim"]*4),
            GELU(),
            nn.Linear(cfg["emb_dim"]*4,cfg["emb_dim"])
        )

    def forward(self,x):
        return self.layers(x)
    

class GPTModel(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"],cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"],cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for  _ in range(cfg["n_layers"])]
        )
        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"],cfg["vocab_size"],bias=False
        )

    def forward(self,in_idx):
        batch_size , seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(
            torch.arange(seq_len,device=in_idx.device)
        )
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [4]:
GPT_CONFIG = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

torch.manual_seed(42)
model = GPTModel(GPT_CONFIG)
model.eval();

In [5]:
import tiktoken

def generate_text(model,idx,max_new_tokens,context_size):
    for _ in range(max_new_tokens):
        idx_cond = idx[:,-context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:,-1,:] 
        probas = torch.softmax(logits,dim=-1)
        idx_next = torch.argmax(probas,dim=-1,keepdim=True)
        idx = torch.cat((idx,idx_next),dim=-1)
    return idx


def text_to_token_ids(text,tokenizer):
    encoded = tokenizer.encode(text,allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor

def token_ids_to_text(token_ids,tokenizer):
    flat = token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())

start_context = "The sky is"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text(
    model=model,
    idx=text_to_token_ids(start_context,tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG["context_length"]
)

print(f"Output text:{token_ids_to_text(token_ids,tokenizer)}")

Output text:The sky is Schnewhy ripple gratificationcularSizecampaign lies suggestions meet


In the above cells we have setup the architecture of the model and also the function for generation text. We can see that there is incoherant text being generated by the model. This is because the model is not trained on any data. We will now train the model on the data and then generate text using the trained model.

## Calculating loss for text generation process:

In [6]:
inputs = torch.tensor([[16833, 3626, 6100],   # ["every effort moves",
                       [40,    1107, 588]])   #  "I really like"]

targets = torch.tensor([[3626, 6100, 345  ],  # [" effort moves you",
                        [1107,  588, 11311]]) #  " really like chocolate"]

In [7]:
with torch.no_grad():
    logits = model(inputs)

probas = torch.softmax(logits,dim=-1)
print(probas.shape)

torch.Size([2, 3, 50257])


The text generation process can be simplified as further:
- Given a large corpus of vocabulary , we creating a mapping of the vocabulary to token IDs
- we obtain a n-dimensional probability vector for each input token as specified in the configs
- Identify the index position with highest probability values in each row
- Obtain all the predicted token IDs as the index position with highest probability
- Map the index position back to text via an inverse vocabulary.

In [8]:
token_ids = torch.argmax(probas, dim=-1, keepdim=True)
print("Token IDs:\n", token_ids)

Token IDs:
 tensor([[[24851],
         [  406],
         [40115]],

        [[29716],
         [40825],
         [37022]]])


In [9]:
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

Targets batch 1:  effort moves you
Outputs batch 1: etti L HO


Since the model hasn't yet been trained on vocabulary we can see the outputs of not good quality.

In [10]:
text_idx = 0
target_probas_1 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print("Text 1:", target_probas_1)

text_idx = 1
target_probas_2 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print("Text 2:", target_probas_2)


Text 1: tensor([4.3679e-06, 2.1341e-05, 1.0216e-05])
Text 2: tensor([1.2381e-05, 3.0002e-05, 6.1751e-06])


In [11]:
log_probas = torch.log(torch.cat((target_probas_1, target_probas_2)))
print(log_probas)

tensor([-12.3412, -10.7549, -11.4916, -11.2994, -10.4142, -11.9950])


In [12]:
avg_log_probas = torch.mean(log_probas)
print(avg_log_probas)

tensor(-11.3827)


In [13]:
neg_avg_log_probas = avg_log_probas * -1
print(neg_avg_log_probas)

tensor(11.3827)


In [14]:
#Logits have shape (batch_size, num_tokens, vocab_size)
print("Logits shape:", logits.shape)

# Targets have shape (batch_size, num_tokens)
print("Targets shape:", targets.shape)

Logits shape: torch.Size([2, 3, 50257])
Targets shape: torch.Size([2, 3])


In [15]:
logits_flat = logits.flatten(0, 1)
targets_flat = targets.flatten()

print("Flattened logits:", logits_flat.shape)
print("Flattened targets:", targets_flat.shape)

Flattened logits: torch.Size([6, 50257])
Flattened targets: torch.Size([6])


In [16]:
loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)
print(loss)

tensor(11.3827)


In [17]:
perplexity = torch.exp(loss)
print(perplexity)

tensor(87790.6641)


- Cross entropy loss is the difference between actual and predicted probability distribution of the model.
- Perplexity is just the exponential of cross entropy
- It also refers to the probability distribution predicted by the model and how well it matches the distribution of words present in dataset.

## Training model of small dataset:

Trying to train on the model on already existant dataset which was using in text-preprocessing notebook

In [18]:
file_path = "../Text-processing-basic/data.txt"
with open(file_path, "r") as file:
    text_data = file.read()

In [19]:
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print(f"total number of characters: {total_characters}")
print(f"total number of tokens:{total_tokens}")

total number of characters: 20479
total number of tokens:5145


In [20]:
#diving the dataset into training and validation4
train_ratio = 0.9
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]   


In [21]:
len(train_data), len(val_data)

(18431, 2048)

#### Loading data:

In [22]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDataset(Dataset):
    def __init__(self,txt,tokenizer,max_length,stride):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(txt)

        for i in range(0,len(token_ids)-max_length,stride):
            input_chunk = token_ids[i:i+max_length]
            target_chunk = token_ids[i+1:i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self,idx):
        return self.input_ids[idx],self.target_ids[idx]
    

def create_dataloaderv1(txt,batch_size=4,max_length=256,
                      stride=128,shuffle=True,drop_last=True,
                      num_workers = 0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDataset(txt,tokenizer,max_length,stride)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle = shuffle,
        drop_last = drop_last,
        num_workers = num_workers
    )

    return dataloader

In [23]:
train_loader = create_dataloaderv1(
    train_data,
    batch_size = 2,
    max_length = GPT_CONFIG["context_length"],
    stride = GPT_CONFIG["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloaderv1(
    val_data,
    batch_size = 2,
    max_length = GPT_CONFIG["context_length"],
    stride = GPT_CONFIG["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [24]:
print(f"Train loader data sizes")
for x,y in train_loader:
    print(x.shape,y.shape)

print(f"Validation loader data sizes")
for x,y in val_loader:
    print(x.shape,y.shape)

Train loader data sizes
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
Validation loader data sizes
torch.Size([2, 256]) torch.Size([2, 256])


In [25]:
def calculate_loss_batch(input_batch,target_batch,model,device):
    """
    Function to calculate the loss for a single batch
    """
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(
        logits.flatten(0,1),target_batch.flatten()
    )
    return loss

In [26]:
def calc_loss_loader(data_loader,model,device,num_batches=None):
    total_loss = 0
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches,len(data_loader))
    for i,(input_batch,target_batch) in enumerate(data_loader):
        if i<num_batches:
            loss = calculate_loss_batch(
                input_batch,target_batch,model,device
            )
            total_loss += loss
        else:
            break
    return total_loss/num_batches

In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
with torch.no_grad(): #this is for disabling the gradient calculation
    train_loss = calc_loss_loader(train_loader,model,device)
    val_loss = calc_loss_loader(val_loader,model,device)
print(f"Training loss:{train_loss}")
print(f"Validation loss:{val_loss}")

Training loss:11.009906768798828
Validation loss:11.048162460327148


#### Training the model:

The process of training the model is as follows:
- iterate over the training epochs
- iterate over the batches for each epoch
- reset the gradients of loss from previous batch
- calculate loss for current batch
- backward pass to calculate loss gradients
- update the model weights using loss gradients
Inspect
- the losses in training and validation set
- sample text to see the quality of generated text

In [28]:
def train_model_simple(model,train_loader,val_loader,
                       optimizer,device,num_epochs,eval_freq,
                       eval_iter,start_context,tokenizer):
    train_losses , val_losses , track_tokens_seen = [],[],[]
    tokens_seen , global_step = 0,-1

    for epoch in range(num_epochs):
        model.train()
        for input_batch,target_batch in train_loader:
            optimizer.zero_grad() #resets loss gradients in prev batch
            loss = calculate_loss_batch(
                input_batch,target_batch,model,device
            )
            loss.backward() # calculate loss gradients
            optimizer.step() #update model weights
            tokens_seen += input_batch.numel() 
            global_step += 1

            if global_step % eval_freq == 0:
                train_loss , val_loss = evaluate_model(
                    model,train_loader,val_loader,device,eval_iter
                )
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Epoch:{epoch+1} (Step:{global_step:06d}): "
                      f"Train loss:{train_loss:.4f}, "
                      f"Validation loss:{val_loss:.4f}")
                
        generate_print_sample(
            model,tokenizer,device,start_context
        )

    return train_losses, val_losses, track_tokens_seen

In [29]:
def evaluate_model(model,train_loader,val_loader,device,eval_iter):
    model.eval() #dropout is disabled
    with torch.no_grad(): #disable gradient tracking as  it not required during eval
        train_loss = calc_loss_loader(train_loader,model,device,eval_iter)
        val_loss = calc_loss_loader(val_loader,model,device,eval_iter)
    model.train()
    return train_loss, val_loss


def generate_print_sample(model,tokenizer,device,start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context,tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text(model,idx=encoded,
                                  max_new_tokens=50,
                                  context_size=context_size)
    decoded_text = token_ids_to_text(token_ids,tokenizer)
    print(f"Generated text:\n{decoded_text}")
    

In [30]:
torch.manual_seed(42)

model = GPTModel(GPT_CONFIG)
model.to(device)
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-4, weight_decay=0.1)

num_epochs = 10

train_losses , val_losses, tokens_seen = train_model_simple(
    model,train_loader,val_loader,optimizer,device,
    num_epochs=num_epochs,eval_freq=5,eval_iter=5,
    start_context="The whole world is",tokenizer=tokenizer
)

Epoch:1 (Step:000000): Train loss:10.5723, Validation loss:10.6358
Epoch:1 (Step:000005): Train loss:9.2652, Validation loss:9.4483
Generated text:
The whole world is,,,, the,, the,,,, the, the,,, the the, the,,, the,, the,,,, the,,, the, the, the, the, the,, the,
Epoch:2 (Step:000010): Train loss:8.6954, Validation loss:8.9277
Epoch:2 (Step:000015): Train loss:8.1692, Validation loss:8.4476
Generated text:
The whole world is the, the, the, the, the, the, the.




































Epoch:3 (Step:000020): Train loss:7.6541, Validation loss:8.0169
Epoch:3 (Step:000025): Train loss:7.1809, Validation loss:7.6020
Generated text:
The whole world is, the, the, the, the, the, the the.




































Epoch:4 (Step:000030): Train loss:6.7261, Validation loss:7.2775
Epoch:4 (Step:000035): Train loss:6.2174, Validation loss:7.0114
Generated text:
The whole world is the.
















































Epoch:5 (Step:000040): Train loss:5.8261, Val