In [11]:
import tiktoken
import torch
import torch.nn as nn

import pandas as pd
import sys
sys.path.append('/home/oskar/nlll/NLP_Learning/NLP_Learning')

from module import *
from functions import *
from dataloader import *
from preprocessing import *


torch.manual_seed(123)
torch.set_printoptions(sci_mode=False)

In [12]:
GPT_config_124M = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "num_head": 12,
    "n_layers": 12,
    "dropout": 0.1,
    "qkv_bias": False
}

In [13]:
text_data = read_csv_text()

In [35]:
len(text_data)

20479

In [14]:
len(text_data)

20479

In [31]:
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
test_data = text_data[split_idx:]

train_dataloader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length = GPT_config_124M["context_length"],
    stride = GPT_config_124M["context_length"],
    shuffle = True,
    drop_last = True,
    num_workers = 0
)

val_dataloader = create_dataloader_v1(
    test_data,
    batch_size=2,
    max_length = GPT_config_124M["context_length"],
    stride = GPT_config_124M["context_length"],
    shuffle = False,
    drop_last = False,
    num_workers = 0
)

In [16]:
torch.cuda.empty_cache()

In [32]:
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=0)

    return dataloader

In [39]:
#from previous_chapters import create_dataloader_v1

# Train/validation ratio
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]


torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_config_124M["context_length"],
    stride=GPT_config_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [18]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, 
                       eval_freq, eval_iter, start_context):
    train_loss, val_loss, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step += 1

            if global_step % eval_freq == 0:
                train_loss_val, val_loss_val = evaluate_model(model, train_loader, val_loader, device, eval_iter)
                train_loss.append(train_loss_val)
                val_loss.append(val_loss_val)
                track_tokens_seen.append(tokens_seen)
                print("Epoch", epoch, "Step", global_step, "Train loss", train_loss[-1], "Val loss", val_loss[-1])

        generate_and_print_sample(model, train_loader.dataset.tokenizer, device, start_context, eval_iter)
    return train_loss, val_loss, track_tokens_seen

In [19]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device)
        val_loss = calc_loss_loader(val_loader, model, device)
    model.train()
    return train_loss, val_loss


def generate_and_print_sample(model, tokenizer, device, start_context, eval_iter):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(model, encoded, eval_iter, context_size)
        decoded_text = token_to_text(token_ids, tokenizer)
        print(decoded_text.replace('\n', ' '))
    model.train()

In [20]:
#device = 'cpu'

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(123)
model = GPTModel(GPT_config_124M)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0004, weight_decay=0.1)
num_epochs = 10

train_loss, val_loss, tokens_seen = train_model_simple(
    model, train_dataloader, val_dataloader, optimizer,
    device, eval_freq=5, eval_iter=5, start_context="Every effort moves you"
)

Epoch 0 Step 0 Train loss 10.642916997273764 Val loss 10.659255027770996
Epoch 0 Step 5 Train loss 9.47816605038113 Val loss 9.640358924865723
Every effort moves you,, the,,
Epoch 1 Step 10 Train loss 8.863042407565647 Val loss 9.071943283081055
Epoch 1 Step 15 Train loss 8.434476322597927 Val loss 8.653169631958008
Every effort moves you,, the,,
Epoch 2 Step 20 Train loss 8.017120944129097 Val loss 8.230504989624023


KeyboardInterrupt: 

In [None]:
asdfadsf

In [None]:
import torch
import tiktoken

# Get the tokenizer for "gpt2" encoding
tokenizer = tiktoken.get_encoding("gpt2")

# Create a tensor for the tokens
token_tensor = torch.tensor([6109, 3626, 6100, 345, 262])

# Decode the token ids directly from a flat list (converted from tensor to list)
decoded_text = tokenizer.decode(token_tensor.tolist())

print(decoded_text)


Every effort moves you the
