In [1]:
import tiktoken
import torch
import torch.nn as nn

import pandas as pd
import sys
sys.path.append('/home/oskar/nlll/NLP_Learning/NLP_Learning')

from module import *
from functions import *
from dataloader import *
from preprocessing import *


torch.manual_seed(123)
torch.set_printoptions(sci_mode=False)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
GPT_config_124M = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "num_head": 12,
    "n_layers": 12,
    "dropout": 0.1,
    "qkv_bias": False
}

In [3]:
model = GPTModel(GPT_config_124M)

In [4]:
text_data = read_csv_text()

In [5]:
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
test_data = text_data[split_idx:]

train_dataloader = create_dataloader(
    train_data,
    batch_size=2,
    max_length = GPT_config_124M["context_length"],
    stride = GPT_config_124M["context_length"],
    shuffle = True,
    drop_last = True,
    num_workers = 0
)

val_dataloader = create_dataloader(
    test_data,
    batch_size=2,
    max_length = GPT_config_124M["context_length"],
    stride = GPT_config_124M["context_length"],
    shuffle = False,
    drop_last = False,
    num_workers = 0
)

In [6]:
torch.cuda.empty_cache()

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = 'cpu'
model.to(device)

train_loss = calc_loss_loader(train_dataloader, model, device)
val_loss = calc_loss_loader(val_dataloader, model, device)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 10.973967658148872
Validation loss: 10.948689460754395


In [8]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, 
                       eval_freq, eval_iter, start_context):
    train_loss, val_loss, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step += 1

            if global_step % eval_freq == 0:
                train_loss_val, val_loss_val = evaluate_model(model, train_loader, val_loader, device, eval_iter)
                train_loss.append(train_loss_val)
                val_loss.append(val_loss_val)
                track_tokens_seen.append(tokens_seen)
                print("Epoch", epoch, "Step", global_step, "Train loss", train_loss[-1], "Val loss", val_loss[-1])

        generate_and_print_sample(model, train_loader.dataset.tokenizer, device, start_context, eval_iter)
    return train_loss, val_loss, track_tokens_seen

In [9]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device)
        val_loss = calc_loss_loader(val_loader, model, device)
    model.train()
    return train_loss, val_loss


def generate_and_print_sample(model, tokenizer, device, start_context, eval_iter):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(model, encoded, eval_iter, context_size)
        decoded_text = token_to_text(token_ids, tokenizer)
        print(decoded_text.replace('\n', ' '))
    model.train()

In [10]:
torch.manual_seed(123)
model = GPTModel(GPT_config_124M)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0004, weight_decay=0.1)
num_epochs = 20

train_loss, val_loss, tokens_seen = train_model_simple(
    model, train_dataloader, val_dataloader, optimizer,
    device, eval_freq=5, eval_iter=5, start_context="Every effort moves you"
)

Epoch 0 Step 0 Train loss 10.642916997273764 Val loss 10.659255027770996
Epoch 0 Step 5 Train loss 9.47816605038113 Val loss 9.640358924865723
Every effort moves you,, the,,
Epoch 1 Step 10 Train loss 8.863042407565647 Val loss 9.071943283081055
Epoch 1 Step 15 Train loss 8.434476322597927 Val loss 8.653169631958008
Every effort moves you,, the,,
Epoch 2 Step 20 Train loss 8.017120944129097 Val loss 8.230504989624023
Epoch 2 Step 25 Train loss 7.684959676530626 Val loss 7.881529808044434
Every effort moves you the, the, the
Epoch 3 Step 30 Train loss 7.470898840162489 Val loss 7.666396617889404
Epoch 3 Step 35 Train loss 7.302070246802436 Val loss 7.504195690155029
Every effort moves you the, the, the
Epoch 4 Step 40 Train loss 7.173399024539524 Val loss 7.383875370025635
Every effort moves you,, and,,
Epoch 5 Step 45 Train loss 7.06577385796441 Val loss 7.2563066482543945
Epoch 5 Step 50 Train loss 6.977528889973958 Val loss 7.190995693206787
Every effort moves you, the   
Epoch 6 Ste

In [11]:
device

device(type='cuda')

In [12]:
dasfdsaf

NameError: name 'dasfdsaf' is not defined

In [None]:
tokenizer = tiktoken.get_encoding("gpt2")
tokenizer.decode(torch.tensor([[6109, 3626, 6100,  345,  262]]).unsqueeze(0).tolist())

TypeError: argument 'tokens': 'list' object cannot be interpreted as an integer

In [None]:
import torch
import tiktoken

# Get the tokenizer for "gpt2" encoding
tokenizer = tiktoken.get_encoding("gpt2")

# Create a tensor for the tokens
token_tensor = torch.tensor([6109, 3626, 6100, 345, 262])

# Decode the token ids directly from a flat list (converted from tensor to list)
decoded_text = tokenizer.decode(token_tensor.tolist())

print(decoded_text)


Every effort moves you the
