In [29]:
import nbimporter
import torch
import tiktoken
from GPT import GPTModel
from text_processor import create_dataloader_v1


In [13]:
# Hyperparameters handed to GPTModel (defined in GPT.py, not shown here).
# Within this notebook only "context_length" is read directly: it is the
# window size / stride of the data loaders and the crop length during
# generation. The remaining keys are interpreted by GPTModel.
GPT_CONFIG_124M = dict(
    vocab_size=50257,
    context_length=256,
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate=0.1,
    qkv_bias=False,
)

In [30]:
# Load tiktoken's "gpt2" encoding once at module level; the same tokenizer
# object is reused for counting corpus tokens and for the text <-> token
# helper functions.
tokenizer = tiktoken.get_encoding("gpt2")

def generate_out_tokens(model, idx, context_length, new_tokens):
    """Greedily extend a batch of token sequences by ``new_tokens`` tokens.

    Args:
        model: Callable taking a 2-D tensor of token ids and returning a 3-D
            tensor of logits indexed as (batch, position, vocabulary).
        idx: 2-D tensor of token ids, (batch, sequence).
        context_length: Only the last ``context_length`` tokens of the
            running sequence are fed to ``model`` at each step.
        new_tokens: Number of tokens to append.

    Returns:
        ``idx`` with ``new_tokens`` extra columns appended along dim 1.

    Note:
        The shape of the logits is printed on every step (debug output).
    """
    sequence = idx
    for _step in range(new_tokens):
        # Crop to the most recent tokens the model is allowed to see.
        window = sequence[:, -context_length:]
        with torch.no_grad():
            all_logits = model(window)
            print(all_logits.shape)

        # Only the prediction for the final position is needed.
        last_logits = all_logits[:, -1, :]
        # Greedy choice: the most probable token, kept as a column vector so
        # it can be concatenated onto the sequence.
        next_token = torch.softmax(last_logits, dim=-1).argmax(dim=-1, keepdim=True)

        sequence = torch.cat([sequence, next_token], dim=1)

    return sequence

def text_to_token(text, tokenizer):
    """Encode ``text`` and return the ids as a tensor with a leading batch dim.

    Args:
        text: String to encode.
        tokenizer: Object exposing ``encode(text, allowed_special=...)``;
            ``'<|endoftext|>'`` is passed as an allowed special token.

    Returns:
        Tensor built from the encoded ids with a size-1 dimension inserted
        at position 0.
    """
    token_ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Insert the size-1 leading dimension so the result looks like a batch.
    return torch.unsqueeze(torch.tensor(token_ids), 0)
    

def token_to_text(tokens, tokenizer):
    """Decode a tensor of token ids back into a string.

    Args:
        tokens: Tensor of token ids; dimension 0 is squeezed away before
            decoding (it is only removed if it has size 1).
        tokenizer: Object exposing ``decode(list_of_ids)``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the flattened id list.
    """
    id_list = tokens.squeeze(0).tolist()
    return tokenizer.decode(id_list)

In [35]:
# Read the raw corpus. The encoding is pinned to UTF-8 so that the character
# and token counts below do not depend on the platform's default locale
# encoding (e.g. cp1252 on Windows).
with open("the-verdict.txt", "r", encoding="utf-8") as file:
    r = file.read()

# Corpus size in characters and in tokens under the module-level tokenizer.
data_chars = len(r)
data_token = tokenizer.encode(r)
print(f"total chars: {data_chars}")
print(f"total tokens: {len(data_token)}")

total chars: 20479
total tokens: 5145


In [37]:
# Hold-out split on the raw text: the first 85% of the characters are used
# for training, the remainder for validation. The cut is made at a character
# offset, before tokenization.
split_ratio = 0.85
split_idx = int(split_ratio * data_chars)
train_data, val_data = r[:split_idx], r[split_idx:]

In [52]:
# Both loaders use stride == max_length == context_length.
# NOTE(review): presumably this yields non-overlapping windows and the
# shuffle/drop_last flags are forwarded to torch's DataLoader -- confirm
# against create_dataloader_v1 in text_processor.

# Training: shuffled every epoch; the trailing incomplete batch is dropped.
train_data_loader = create_dataloader_v1(
    txt=train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    shuffle=True,
    drop_last=True,
    num_workers=0,
)

# Validation: NOT shuffled, so that evaluating only the first k batches
# (batch_loss(..., num_batches=k)) scores the same data on every call and
# successive validation losses are comparable. The incomplete last batch is
# kept so no validation data is discarded.
val_data_loader = create_dataloader_v1(
    txt=val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    shuffle=False,
    drop_last=False,
    num_workers=0,
)

In [55]:
# Sanity check: print the (input, target) batch shapes produced by each loader.
for split_name, loader in (
    ("training set:", train_data_loader),
    ("validation set:", val_data_loader),
):
    print(split_name)
    for x, y in loader:
        print(x.shape, y.shape)

training set:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
validation set:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([1, 256]) torch.Size([1, 256])


In [64]:
def cross_entropy(input_batch, target_batch, model, device):
    """Compute the cross-entropy loss of ``model`` on a single batch.

    Args:
        input_batch: Tensor fed to ``model`` after being moved to ``device``.
        target_batch: Tensor of target class indices, moved to ``device``
            and fully flattened before the loss is computed.
        model: Callable returning logits; the first two dimensions of its
            output are merged so each row lines up with one flattened target.
        device: Device passed to ``Tensor.to``.

    Returns:
        The tensor returned by ``torch.nn.functional.cross_entropy`` (default
        reduction), still attached to the autograd graph.
    """
    inputs, targets = input_batch.to(device), target_batch.to(device)
    predictions = model(inputs)
    return torch.nn.functional.cross_entropy(
        predictions.flatten(0, 1),
        targets.flatten(),
    )

def batch_loss(data_loader, model, device, num_batches=None):
    """Average the per-batch cross-entropy loss over a data loader.

    Args:
        data_loader: Sized iterable yielding ``(input_batch, target_batch)``
            pairs.
        model: Model forwarded to ``cross_entropy``.
        device: Device forwarded to ``cross_entropy``.
        num_batches: Maximum number of batches to evaluate. ``None`` (the
            default) means every batch; larger values are capped at
            ``len(data_loader)``.

    Returns:
        The mean of the per-batch losses as a Python float, or ``nan`` when
        there is nothing to average (empty loader, or ``num_batches`` <= 0).
    """
    available = len(data_loader)
    if available == 0:
        return float("nan")

    if num_batches is None:
        num_batches = available
    else:
        num_batches = min(num_batches, available)

    # A zero or negative request leaves nothing to average. Return NaN, as for
    # an empty loader, instead of dividing by zero (num_batches == 0) or
    # reporting a meaningless -0.0 (num_batches < 0).
    if num_batches <= 0:
        return float("nan")

    total_loss = 0.0
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        # .item() converts the loss tensor to a plain float before summing.
        total_loss += cross_entropy(input_batch, target_batch, model, device).item()

    return total_loss / num_batches



In [28]:
# Smoke test of the generation loop on a freshly constructed (untrained) model.
# The seed is fixed before the model is built so the run is repeatable.
torch.manual_seed(123)
gpt = GPTModel(GPT_CONFIG_124M)
gpt.eval()  # switch to evaluation mode before generating

start_context = "Every effort moves you"
token_ids = generate_out_tokens(
    model=gpt,
    idx=text_to_token(start_context, tokenizer),
    context_length=GPT_CONFIG_124M["context_length"],
    new_tokens=10,
)

# Only the shape is inspected here; to view the decoded text instead, call
# token_to_text(token_ids, tokenizer).
print(token_ids.shape)



torch.Size([1, 4, 50257])
torch.Size([1, 5, 50257])
torch.Size([1, 6, 50257])
torch.Size([1, 7, 50257])
torch.Size([1, 8, 50257])
torch.Size([1, 9, 50257])
torch.Size([1, 10, 50257])
torch.Size([1, 11, 50257])
torch.Size([1, 12, 50257])
torch.Size([1, 13, 50257])
torch.Size([1, 14])
