In [87]:
import nbimporter
import torch
import tiktoken
from GPT import GPTModel
from text_processor import create_dataloader_v1


In [88]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

In [99]:
tokenizer = tiktoken.get_encoding("gpt2")

def generate_out_tokens(model, idx, context_length, new_tokens):
    for _ in range(new_tokens):
        idx_cond = idx[:, -context_length:]
        with torch.no_grad():
            logits = model(idx_cond)
        
        logits = logits[:, -1, :]
        probs = torch.softmax(logits, dim=-1)
        idx_next = torch.argmax(probs, dim=-1, keepdim=True)

        idx = torch.cat((idx, idx_next), dim=1)

    return idx

def text_to_token(text, tokenizer):
    tokens = torch.tensor(tokenizer.encode(text, allowed_special={'<|endoftext|>'})).unsqueeze(0)
    return tokens
    

def token_to_text(tokens, tokenizer):
    text = tokenizer.decode(tokens.squeeze(0).tolist())
    return text 

In [100]:
with open("the-verdict.txt", "r") as file:
    r = file.read() 

data_chars = len(r)
data_token = tokenizer.encode(r)
print(f"total chars: {data_chars}")
print(f"total tokens: {len(data_token)}")

total chars: 20479
total tokens: 5145


In [101]:
split_ratio = 0.85
split_idx = int(data_chars * split_ratio)
train_data = r[:split_idx]
val_data = r[split_idx:]

In [102]:
train_data_loader = create_dataloader_v1(txt=train_data, batch_size=2, max_length=GPT_CONFIG_124M["context_length"], stride=GPT_CONFIG_124M["context_length"], shuffle=True, drop_last=True, num_workers=0)

val_data_loader = create_dataloader_v1(txt=val_data, batch_size=2, max_length=GPT_CONFIG_124M["context_length"], stride=GPT_CONFIG_124M["context_length"], shuffle=True, drop_last=False, num_workers=0)

In [103]:
print("training set:")
for x, y in train_data_loader:
    print(x.shape, y.shape)
print("validation set:")
for x, y in val_data_loader:
    print(x.shape, y.shape)

training set:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
validation set:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([1, 256]) torch.Size([1, 256])


In [104]:
def cross_entropy(input_batch, target_batch, model, device):
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0,1), target_batch.flatten())

    return loss

def batch_loss(data_loader, model, device, num_batches=None):
    total_loss = 0

    if len(data_loader) == 0:
        return float("nan")
    
    elif num_batches is None:
        num_batches = len(data_loader)

    else:
        num_batches = min(num_batches, len(data_loader))

    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = cross_entropy(input_batch, target_batch, model, device)
            total_loss += loss.item()

        else:
            break

    return total_loss / num_batches



In [105]:
device = "mps" if torch.mps.is_available() else "cpu"
gpt = GPTModel(GPT_CONFIG_124M)
gpt.to(device)

with torch.no_grad():
    train_loss = batch_loss(model=gpt, data_loader=train_data_loader, device=device)
    val_loss = batch_loss(model=gpt, device=device, data_loader=val_data_loader)
    print(f"train loss: {train_loss}")
    print(f"val loss: {val_loss}")




train loss: 10.980239629745483
val loss: 10.933394432067871


In [106]:
def sample_batch(model, tokenizer, start_context, device, cfg):
    model.eval()
    context_length = cfg["context_length"]
    tokens = text_to_token(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_out_tokens(model=model, idx=tokens, context_length=context_length, new_tokens=50)
    text = token_to_text(token_ids, tokenizer)
    print(text.replace("/n", " "))
    model.train()


def eval_model(model, train_data_loader, val_data_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = batch_loss(train_data_loader, model, device, num_batches=eval_iter)
        val_loss   = batch_loss(val_data_loader, model, device, num_batches=eval_iter)
        model.train()

    return train_loss, val_loss



In [113]:
def gpt_trainer(model, train_loader, val_loader, optimizer, num_epochs, eval_freq, eval_iter, device, start_context, tokenizer, cfg):
    train_losses, val_losses, track_tokens = [], [], []
    tokens_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = cross_entropy(input_batch=input_batch, target_batch=target_batch, model=model, device=device)
            loss.backward()
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step += 1

            if global_step % eval_freq == 0:
                train_loss, val_loss = eval_model(model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens.append(tokens_seen)


                print(f" epoch : {epoch}    step: {global_step}")
                print(f" train loss: {train_loss}   val loss: {val_loss}")


        sample_batch(model, tokenizer, start_context, device, cfg)
    
    return train_losses, val_losses, track_tokens




In [114]:
torch.manual_seed(123)
gpt = GPTModel(GPT_CONFIG_124M)
gpt.to(device)
optimizer = torch.optim.AdamW(gpt.parameters(), weight_decay=0.1, lr=0.0004)
start_context = "hello, my name is"
num_epochs = 1

tl, vl, ts = gpt_trainer(model=gpt, train_loader=train_data_loader, val_loader=val_data_loader, optimizer=optimizer, num_epochs=num_epochs, eval_freq=2, eval_iter=5, device=device, start_context=start_context, tokenizer=tokenizer, cfg=GPT_CONFIG_124M)

 epoch : 0    step: 0
 train loss: 9.828278732299804   val loss: 10.01657772064209
 epoch : 0    step: 2
 train loss: 8.939193725585938   val loss: 9.13503646850586
 epoch : 0    step: 4
 train loss: 8.333987045288087   val loss: 8.565227031707764
 epoch : 0    step: 6
 train loss: 7.841167831420899   val loss: 7.980482339859009
hello, my name is the, the the the the the the the,, the the the,,, the the the, the,, the the,, the,, the the the, the the the,, the, the the,,,, the,
