In [18]:
import torch
from GPT_module import GPTModel
from utility_module import text_to_token_ids, token_ids_to_text, generate_text_simple, calc_loss_batch, calc_loss_loader, create_dataloader



In [17]:
# Hyperparameters for the model built later in this notebook via
# GPTModel(GPT_CONFIG_124M); "context_length" is also reused as the
# max_length/stride of the data loaders.
GPT_CONFIG_124M = dict(
    vocab_size=50257,    # vocabulary size of the BPE tokenizer
    context_length=256,  # number of tokens per input sequence
    emb_dim=768,         # embedding dimension
    n_heads=12,          # number of attention heads
    n_layers=12,         # number of layers
    drop_rate=0.1,       # dropout rate
    qkv_bias=False,      # whether query/key/value projections use a bias
)


*evaluate model loss function*

*It calculates the loss over the training and validation sets while the model is in evaluation mode, with gradient tracking and dropout disabled.*

In [6]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Compute the training and validation loss without updating the model.

    The model is switched to evaluation mode for the duration of the call and
    gradient tracking is turned off. On exit -- including when the loss
    computation raises -- the model is put back into whichever mode
    (training or evaluation) it was in when the function was called.

    Args:
        model: ``torch.nn.Module``-like object exposing ``eval()``,
            ``train(mode)`` and the ``training`` flag.
        train_loader: Loader handed to ``calc_loss_loader`` for the training loss.
        val_loader: Loader handed to ``calc_loss_loader`` for the validation loss.
        device: Device forwarded unchanged to ``calc_loss_loader``.
        eval_iter: Forwarded to ``calc_loss_loader`` as ``num_batches``.

    Returns:
        Tuple ``(train_loss, val_loss)`` with the two values returned by
        ``calc_loss_loader``.
    """
    was_training = model.training  # remember the caller's mode so it can be restored
    model.eval()  # evaluation mode: this (not no_grad) is what disables dropout
    try:
        with torch.no_grad():  # no gradient tracking while measuring the loss
            train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
            val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    finally:
        # Restore the previous mode even if the loss computation failed.
        model.train(was_training)
    return train_loss, val_loss



*generate and print samples*

*It tracks the model's improvement during training: it takes a text snippet as input, converts it to token IDs, feeds them into the LLM, and prints the generated text.*

In [11]:
def generate_and_print_sample(model, tokenizer, device, start_context):
    """Generate a short continuation of ``start_context`` and print it.

    The model is switched to evaluation mode while generating. On exit --
    including when generation raises -- the model is put back into whichever
    mode (training or evaluation) it was in when the function was called.

    Args:
        model: Model exposing ``pos_emb.weight``, ``eval()``, ``train(mode)``
            and the ``training`` flag.
        tokenizer: Tokenizer forwarded to ``text_to_token_ids`` and
            ``token_ids_to_text``.
        device: Device the encoded prompt is moved to before generation.
        start_context: Prompt text to continue.

    Returns:
        None. The decoded text is written to stdout.
    """
    was_training = model.training  # remember the caller's mode so it can be restored
    model.eval()
    try:
        # The number of rows in the positional-embedding table is used as the
        # context size passed to the generator.
        context_size = model.pos_emb.weight.shape[0]
        encoded = text_to_token_ids(start_context, tokenizer).to(device)
        with torch.no_grad():  # no gradient tracking during generation
            token_ids = generate_text_simple(
                idx=encoded,
                max_new_tokens=50,
                context_size=context_size,
                model=model,
            )
            decoded_text = token_ids_to_text(token_ids, tokenizer)
            print(decoded_text)
    finally:
        # Restore the previous mode even if generation failed.
        model.train(was_training)



**The main function for pretraining LLMs**

In [12]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter, start_context, tokenizer):
    """Run a basic training loop with periodic evaluation and text samples.

    For every batch the gradients are reset, the batch loss is computed with
    ``calc_loss_batch``, back-propagated, and the optimizer takes one step.
    Whenever the running step counter (starting at 0) is a multiple of
    ``eval_freq``, the train/validation losses are computed with
    ``evaluate_model``, recorded, and printed. A sample is generated from
    ``start_context`` at the end of each epoch.

    Args:
        model: Model to train.
        train_loader: Iterable yielding ``(input_batch, target_batch)`` pairs.
        val_loader: Loader forwarded to ``evaluate_model``.
        optimizer: Optimizer exposing ``zero_grad()`` and ``step()``.
        device: Device forwarded to the loss and sampling helpers.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Evaluate every ``eval_freq`` steps.
        eval_iter: Forwarded to ``evaluate_model``.
        start_context: Prompt used for the per-epoch sample.
        tokenizer: Tokenizer forwarded to ``generate_and_print_sample``.

    Returns:
        Tuple ``(train_losses, val_losses, track_tokens_seen)`` of lists with
        one entry per evaluation.
    """
    loss_history_train = []
    loss_history_val = []
    tokens_at_eval = []
    token_count = 0
    step = -1  # becomes 0 on the first batch, so the first batch is evaluated

    for epoch_number in range(1, num_epochs + 1):
        model.train()

        for inputs, targets in train_loader:
            optimizer.zero_grad()  # drop gradients left over from the previous batch
            batch_loss = calc_loss_batch(
                input_batch=inputs,
                target_batch=targets,
                model=model,
                device=device,
            )
            batch_loss.backward()
            optimizer.step()

            token_count += inputs.numel()
            step += 1

            # Only evaluate on every eval_freq-th step.
            if step % eval_freq != 0:
                continue

            current_train, current_val = evaluate_model(
                model, train_loader, val_loader, device, eval_iter
            )
            loss_history_train.append(current_train)
            loss_history_val.append(current_val)
            tokens_at_eval.append(token_count)

            print(f"Epoch {epoch_number} (step{step:06d})| Train Loss {current_train:.3f}| Val Loss {current_val:.3f}")

        # One generated sample per epoch to eyeball progress.
        generate_and_print_sample(model, tokenizer, device, start_context)

    return loss_history_train, loss_history_val, tokens_at_eval



In [20]:
import tiktoken

# Tokenizer used for the rest of the notebook (tiktoken's "gpt2" encoding).
tokenizer = tiktoken.get_encoding("gpt2")

# Read the whole corpus into memory as a single string.
file_path = "the-verdict.txt"
with open(file_path, mode="r", encoding="utf-8") as corpus_file:
    text_data = corpus_file.read()


In [21]:
# Corpus size in characters and in tokens under the loaded tokenizer.
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))

# One value per line: characters first, then tokens.
print(total_characters, total_tokens, sep="\n")

20479
5145


In [22]:
# Character-level split: the first 90% of the text is used for training,
# the remainder for validation.
train_ratio = 0.90
split_idx = int(len(text_data) * train_ratio)
train_data, val_data = text_data[:split_idx], text_data[split_idx:]


In [23]:
torch.manual_seed(123)  # fixed seed before building the (shuffled) training loader

# Both loaders use windows of one full context length, with stride equal to
# max_length.
train_loader = create_dataloader(
    text=train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    shuffle=True,
    drop_last=True,
    num_workers=0,
)
val_loader = create_dataloader(
    text=val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    shuffle=False,
    drop_last=False,
    num_workers=0,
)



In [None]:
torch.manual_seed(123)  # reseed before the model is constructed

model = GPTModel(GPT_CONFIG_124M)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU when available
model.to(device=device)

optimizer = torch.optim.AdamW(params=model.parameters(), lr=4e-4, weight_decay=0.1)

num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    num_epochs=num_epochs,
    eval_freq=5,   # evaluate every 5 optimizer steps
    eval_iter=1,   # forwarded to evaluate_model
    start_context="Every effort moves you",
    tokenizer=tokenizer,
)

