In [1]:
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
import sys, os, math

sys.path.insert(0, '../dlp')
from data_access import PQDataAccess
from data_process import *

pd.set_option('future.no_silent_downcasting', True)
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

batch_size = 64
block_size = 32
da = PQDataAccess("/home/aac/Alireza/datasets/taxseq/corpus_1000", batch_size)
epochs= 10_000
val_epoch = 100
num_val = 25

model_name = "Karpethy_GPT"
checkpoint_dir = f"../checkpoints/{model_name}_checkpoints"

if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
print(checkpoint_dir)

 WORLD_SIZE=1 , LOCAL_WORLD_SIZE=1,RANK =0,LOCAL_RANK = 0 


  from .autonotebook import tqdm as notebook_tqdm


Loaded dictionary.
cuda:0
../checkpoints/Karpethy_GPT_checkpoints


In [2]:
def estimate_loss(eval_iters):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            tensor_batch = GPT_data_to_tensor_batch(da.get_batch(), block_size)
            tensor_batch.gpu(device)
            
            _, loss = model(tensor_batch.input_ids, tensor_batch.output_ids)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [3]:
from models.GPT import GPTLanguageModel

vocab_size = 23
n_embd = 512
n_head = 8
n_layer = 6
dropout = 0.2

model = GPTLanguageModel(vocab_size, block_size, n_embd, n_head, n_layer, dropout, device).to(device)
print(sum(p.numel() for p in model.parameters()) / 1e6, 'M parameters')

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

18.946071 M parameters


In [4]:
for iter_ in range(epochs):
    # every once in a while evaluate the loss on train and val sets
    if iter_ % val_epoch == 0 or iter_ == epochs - 1:
        losses = estimate_loss(num_val)
        print(f"step {iter_}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    tensor_batch = GPT_data_to_tensor_batch(da.get_batch(), block_size)
    tensor_batch.gpu(device)
    
    _, loss = model(tensor_batch.input_ids, tensor_batch.output_ids)
    
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 3.2579, val loss 3.2510
step 100: train loss 2.9004, val loss 2.8926
step 200: train loss 2.8879, val loss 2.8944
step 300: train loss 2.8537, val loss 2.8737
step 400: train loss 2.8711, val loss 2.8640
step 500: train loss 2.8649, val loss 2.8716
step 600: train loss 2.8567, val loss 2.8571
step 700: train loss 2.8507, val loss 2.8595
step 800: train loss 2.8605, val loss 2.8842
step 900: train loss 2.8532, val loss 2.8572
step 1000: train loss 2.8559, val loss 2.8585
step 1100: train loss 2.8546, val loss 2.8500
step 1200: train loss 2.8557, val loss 2.8508
step 1300: train loss 2.8412, val loss 2.8513
step 1400: train loss 2.8464, val loss 2.8610
step 1500: train loss 2.8533, val loss 2.8527
step 1600: train loss 2.8424, val loss 2.8646
step 1700: train loss 2.8585, val loss 2.8465
step 1800: train loss 2.8480, val loss 2.8575
step 1900: train loss 2.8423, val loss 2.8462
step 2000: train loss 2.8299, val loss 2.8461
step 2100: train loss 2.8542, val loss 2.8447


In [15]:
# generate from the model
for _ in range(1):
    context = torch.ones((1, 1), device=device).to(torch.long)
    output = generate(context, max_new_tokens=50, block_size=block_size)[0].tolist()

    print(*[special_idx_to_char[s] for s in output])

22
12
21
4
17
11
22
8
21
22
5
19
8
8
18
15
3
17
17
6
15
7
15
3
11
10
15
16
5
19
20
9
10
15
20
17
8
18
8
14
11
5
7
8
5
5
15
19
3
15
<s> Y L W C R K Y G W Y D T G G S P A R R E P F P A K I P Q D T V H I P V R G S G N K D F G D D P T A P


In [14]:
from torch.nn import functional as F


def generate(idx, max_new_tokens, block_size):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
        # crop idx to the last block_size tokens
        idx_cond = idx[:, -block_size:]
        # get the predictions
        logits, loss = model(idx_cond)
        # focus only on the last time step
        logits = logits[:, -1, :]  # becomes (B, C)
        # apply softmax to get probabilities
        probs = F.softmax(logits, dim=-1)  # (B, C)
        # sample from the distribution
        idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
        # append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        print(idx_next[0].item())
    return idx
