In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset

from model import GPTModel
from config import Config
from load_weights_from_gpt import load_weights_into_gpt
from gpt_download import download_and_load_gpt2
from model_generate import generate
import tiktoken

from torch.utils.data import DataLoader

In [2]:
# Fix the PyTorch RNG seed so sampling and shuffling are repeatable across
# runs; the trailing ';' suppresses the cell's output.
torch.manual_seed(0);

In [3]:
# Hyperparameters used to build GPTModel below. The GPT-2 '355M' checkpoint
# is later loaded into that model, so these values presumably have to match
# the checkpoint's architecture -- TODO confirm against gpt2/355M/hparams.json.
config = Config({
    "n_layers": 24,
    "d_model": 1024,
    "eps": 1e-5,
    "hidden_size_multiplier": 4,
    "num_heads": 16,
    "context_len": 1024,
    "dropout": 0.01,
    "qkv_bias": True,
    "vocab_size": 50257
})

In [4]:
# Sanity check: is a CUDA device visible to PyTorch?
torch.cuda.is_available()

True

In [5]:
# Module-level device used by the model, the loss helpers and generate();
# falls back to CPU when CUDA is not available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"device : {device}")

device : cuda


In [6]:
# tiktoken's 'gpt2' encoding. This module-level tokenizer is also captured as
# the default argument of the collate function and of generate().
tokenizer = tiktoken.get_encoding('gpt2')

In [7]:
# Instantiate the model from `config` and move it to the selected device.
model = GPTModel(config=config).to(device)

In [8]:
# Fetch (or reuse the cached copy of) the GPT-2 355M checkpoint under ./gpt2.
# NOTE(review): `settigns` is a typo for `settings`; it is not read by any
# visible cell, only `params` is used afterwards.
settigns, params = download_and_load_gpt2(model_size='355M', models_dir='gpt2')

File already exists and is up-to-date: gpt2\355M\checkpoint
File already exists and is up-to-date: gpt2\355M\encoder.json
File already exists and is up-to-date: gpt2\355M\hparams.json
File already exists and is up-to-date: gpt2\355M\model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2\355M\model.ckpt.index
File already exists and is up-to-date: gpt2\355M\model.ckpt.meta
File already exists and is up-to-date: gpt2\355M\vocab.bpe


In [9]:
# Load the downloaded parameters into `model`; the return value is not used,
# so the helper presumably updates the model in place -- TODO confirm.
load_weights_into_gpt(model, params)

In [10]:
# Baseline sample from the pretrained weights, before any fine-tuning.
# This call uses the `generate` imported from `model_generate`; a later cell
# defines a local `generate` with a different signature that shadows it.
print(generate(model, 
               starting_context='fibonocci code in python: def fibonacci(n):', 
               tokenizer=tokenizer, 
               max_len=40))

fibonocci code in python: def fibonacci(n): if n >= 1: return results[y*n*n+1:] if n < 0: return garbage.Repr() text = `` `` print fibonacci.joined.text()


In [11]:
# Load the 'teknium/openhermes' instruction dataset with `load_dataset`.
data = load_dataset('teknium/openhermes')

In [12]:
# Inspect the available splits, column names and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 242831
    })
})

In [13]:
# Look at one raw record to see the 'instruction' / 'input' / 'output' fields.
data['train'][0]

 'input': '',
 'instruction': 'Write a Perl script that processes a log file and counts the occurrences of different HTTP status codes. The script should accept the log file path as a command-line argument and print the results to the console in descending order of frequency.\n'}

In [14]:
def apply_text_template(instr, input=''):
    """Build the inference prompt for an instruction.

    The layout matches the training template up to and including the
    ``### Response:`` marker, leaving the response for the model to complete.

    Args:
        instr: instruction text placed after ``### Instruction:``.
        input: optional extra context placed after ``### Input:``.

    Returns:
        The prompt string with both placeholders filled in.
    """
    template = """### Instruction: {instruction}
### Input: {input}
### Response:"""
    # The placeholders must be substituted here: returning the bare template
    # would feed the literal "{instruction}" text to the model and silently
    # ignore both arguments.
    return template.format(instruction=instr, input=input)

In [15]:
def format_text_template(batch):
    """Render one record as a single training string.

    Despite the parameter name, this is mapped without ``batched=True``, so
    ``batch`` is a single record.

    Args:
        batch: mapping with 'instruction', 'input' and 'output' entries.

    Returns:
        ``{'text': <rendered prompt followed by the response>}``.
    """
    layout = """### Instruction: {instruction}
### Input: {input}
### Response: {output}"""

    fields = {key: batch[key] for key in ('instruction', 'input', 'output')}
    rendered = layout.format(**fields)
    return {'text': rendered}

In [16]:
# Smoke-test the template on three rows (indices 10, 20 and 40) before
# mapping it over the whole dataset.
subset_data = data['train'].select([10,20,40])
subset_data = subset_data.map(format_text_template)

print(subset_data['text'][0])

### Instruction: Design a roller coaster with three distinct features, explaining the purpose of each feature and how it contributes to the overall ride experience.
### Input: 
### Response: 1. The Gravity-Defying Loop: One of the most iconic and thrilling features of our roller coaster is a massive, vertical loop that takes riders upside down as they travel through it at high speeds. This gravity-defying loop creates an intense sensation of weightlessness and disorientation for riders, making them feel like they are defying the laws of physics. As the train enters the loop, riders experience strong positive G-forces pushing them into their seats, followed by brief moments of weightlessness at the top of the loop before being pushed back into their seats again as they exit the loop. This feature contributes to the overall ride experience by providing a heart-pounding moment of adrenaline and excitement that leaves riders wanting more.

2. The Airtime Hills: To create a dynamic and vari

In [17]:
# Apply the template to every row and drop the raw columns, leaving only 'text'.
# NOTE(review): `data` is rebound here, so re-running this cell without
# reloading the dataset is expected to fail (the raw columns are gone).
data = data.map(format_text_template, remove_columns=['output','input','instruction'])

In [18]:
# tokenize text in data
def tokenize(batch):
    """Encode a batch of formatted texts with the module-level tokenizer.

    Args:
        batch: mapping whose 'text' entry is a list of strings (the function
            is mapped with ``batched=True``).

    Returns:
        ``{'input_ids': [...]}`` holding the ``encode_batch`` result; the
        '<|endoftext|>' marker is allowed as a special token.
    """
    return {
        'input_ids': tokenizer.encode_batch(
            batch['text'],
            allowed_special={'<|endoftext|>'},
        )
    }

In [19]:
# Tokenize in batches; 'text' is dropped so each row keeps only 'input_ids'.
dataset = data.map(tokenize, batched=True, remove_columns=['text'])

In [20]:
# Drop every example with more than 1024 tokens. The collate function appends
# one end-of-text token and then removes the last position to form the inputs,
# so model inputs stay within the configured context_len of 1024.
dataset = dataset.filter(lambda x: len(x['input_ids']) <= 1024)

In [21]:
# Round-trip check: decode the first tokenized example back to text.
tokenizer.decode(dataset['train'][0]['input_ids'])



In [22]:
# Hold out 5% of the examples (seeded shuffle); the remaining 95% is the
# training set.
# NOTE(review): `dataset` is rebound from the full DatasetDict to the split
# result, so re-running this cell splits the already-reduced train split again.
dataset = dataset['train'].train_test_split(test_size=0.05, shuffle=True, seed=0)
train_dataset = dataset['train']

In [23]:
# Split the 5% hold-out again: 60% becomes validation, 40% becomes test.
# `test_dataset` first holds the intermediate split dict and is then rebound
# to the final test split.
test_dataset = dataset['test'].train_test_split(test_size=0.4, seed=0)
val_dataset = test_dataset['train']
test_dataset = test_dataset['test']

In [24]:
# End-of-text token id; the collate function uses it both as the
# end-of-sequence marker and as the padding value.
tokenizer.eot_token

50256

In [25]:
def language_model_collate_fn(batch, max_len=None, tokenizer=tokenizer):
    """Collate tokenized examples into ``(input_ids, target_ids)`` tensors.

    Each example gets one end-of-text token appended, all sequences are
    brought to a common length (truncating, then right-padding with the
    end-of-text id) and are finally shifted by one position so that
    ``target_ids[i][t]`` is the token that follows ``input_ids[i][t]``.

    Note: padded positions are not masked in the targets, so the loss is
    also computed on the padding tokens.

    Args:
        batch: list of records, each with an 'input_ids' list of token ids.
        max_len: optional upper bound on the common sequence length (counted
            before the one-token shift). ``None`` or ``0`` means "use the
            longest sequence in the batch".
        tokenizer: object exposing ``eot_token``.

    Returns:
        Tuple of two integer tensors with one row per example; each row has
        one element fewer than the common sequence length.
    """
    pad_tok = tokenizer.eot_token

    # Mark the end of every example with the end-of-text token.
    seqs = [example['input_ids'] + [pad_tok] for example in batch]

    longest = max((len(seq) for seq in seqs), default=0)

    # `max_len` is a cap: it may shorten the batch but must never lengthen it.
    # Taking the larger of the two values would make the truncation below a
    # no-op and pad every batch up to `max_len` instead.
    target_len = min(longest, max_len) if max_len else longest

    # Truncate, then right-pad. A negative repeat count yields an empty list,
    # so sequences that were truncated receive no padding.
    seqs = [
        seq[:target_len] + [pad_tok] * (target_len - len(seq))
        for seq in seqs
    ]

    # Next-token prediction: inputs drop the last token, targets drop the first.
    input_ids = [seq[:-1] for seq in seqs]
    target_ids = [seq[1:] for seq in seqs]

    return torch.tensor(input_ids), torch.tensor(target_ids)

In [26]:
# # testing collate function
# language_model_collate_fn(batch=[[1,2,4,5,6],[2,3,3],[1]],
#                           tokenizer=tokenizer)

In [27]:
# Build the train / validation / test dataloaders.
def create_dataloader(batch_size=4):
    """Create the three DataLoaders over the module-level dataset splits.

    Args:
        batch_size: batch size of the training loader only; the validation
            and test loaders always use a batch size of 8.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``. Only the training
        loader shuffles, drops the last incomplete batch and pins memory.
    """
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        collate_fn=language_model_collate_fn,
        pin_memory=True,
        shuffle=True,
        drop_last=True,
    )

    # Validation and test loaders share the same settings.
    val_loader, test_loader = (
        DataLoader(dataset=split, batch_size=8, collate_fn=language_model_collate_fn)
        for split in (val_dataset, test_dataset)
    )

    return train_loader, val_loader, test_loader

In [28]:
# Inspect the training split (columns and row count).
train_dataset

Dataset({
    features: ['input_ids'],
    num_rows: 224148
})

In [29]:
# Inspect the test split (columns and row count).
test_dataset

Dataset({
    features: ['input_ids'],
    num_rows: 4720
})

In [30]:
# Inspect the validation split (columns and row count).
val_dataset

Dataset({
    features: ['input_ids'],
    num_rows: 7078
})

In [31]:
# Training batches hold 2 examples; `batch_size` only affects the training
# loader, the validation and test loaders keep their fixed batch size of 8.
train_loader, val_loader, test_loader = create_dataloader(batch_size=2)

In [32]:
# for input_ids, target_ids in train_loader:
#     break

In [33]:
# print(tokenizer.decode(input_ids[0].tolist()))

In [34]:
# model.eval();
# model.to(device);

In [35]:
# # check input to model
# with torch.no_grad():
#     output = model(input_ids[0].unsqueeze(0).to('cuda'))

In [36]:
# loss on batch
def calc_loss_on_batch(model, input_ids, target_ids):
    """Return the mean cross-entropy loss of ``model`` on one batch.

    Args:
        model: callable mapping ``input_ids`` to logits. The first two logits
            dimensions are merged before the loss, so they are presumably
            (batch, sequence) with the class scores last -- TODO confirm
            against GPTModel.
        input_ids: tensor passed to the model unchanged.
        target_ids: integer tensor with one class index per merged logits row.

    Returns:
        Scalar loss tensor (default mean reduction of ``cross_entropy``).
    """
    logits = model(input_ids).flatten(0, 1)
    # reshape(-1) always yields a 1-D target. Unlike view(1, -1).squeeze() it
    # does not collapse a single-token batch to a 0-d tensor (which no longer
    # lines up with the 2-D logits) and it also accepts non-contiguous input.
    targets = target_ids.reshape(-1)
    return torch.nn.functional.cross_entropy(logits, targets)

In [37]:
# cal loss on entire loader for eval
def cal_loss_on_data_loader(model, data_loader, max_batch=None):
    """Average the per-batch loss over the first batches of a loader.

    Gradient tracking is not disabled here; callers that only want a metric
    should wrap the call in ``torch.no_grad()``.

    Args:
        model: model handed to ``calc_loss_on_batch``.
        data_loader: sized iterable yielding ``(input_ids, target_ids)`` pairs.
        max_batch: maximum number of batches to use; ``None`` or ``0`` means
            every batch of the loader.

    Returns:
        Mean of the per-batch losses as a Python float.

    Raises:
        ZeroDivisionError: if the loader contains no batches.
    """
    num_batches = len(data_loader)
    limit = min(num_batches, max_batch) if max_batch else num_batches

    running_loss = 0
    for batch_idx, (input_ids, target_ids) in enumerate(data_loader):
        if batch_idx == limit:
            break
        batch_loss = calc_loss_on_batch(
            model, input_ids.to(device), target_ids.to(device)
        )
        running_loss += batch_loss.item()

    return running_loss / limit
        

In [38]:
def evaluate(model, train_loader, val_loader, max_batch):
    """Estimate the training and validation loss without tracking gradients.

    The model is put into eval mode for the measurement and switched back to
    train mode before returning.

    Args:
        model: model to evaluate.
        train_loader: loader providing the training batches.
        val_loader: loader providing the validation batches.
        max_batch: maximum number of batches read from each loader.

    Returns:
        Tuple ``(train_loss, val_loss)``.
    """
    model.eval()
    with torch.no_grad():
        # tuple() consumes the generator while no_grad is still active.
        losses = tuple(
            cal_loss_on_data_loader(model, loader, max_batch=max_batch)
            for loader in (train_loader, val_loader)
        )
    model.train()
    return losses

In [39]:
def generate(model,
            starting_context:str='what is the opposit of saying "i love you"',
            tokenizer=tokenizer,
            max_len=10,
            sampling=True,
            temperature=0.0,
            top_k=None,
            eos_id=None):
    """Generate a completion for an instruction with the fine-tuning template.

    This definition shadows the ``generate`` imported from ``model_generate``
    at the top of the notebook; the two have different signatures.

    Args:
        model: model returning logits for a tensor of token ids. It is left in
            eval mode when the function returns.
        starting_context: instruction text; it is wrapped with
            ``apply_text_template`` before being encoded.
        tokenizer: object providing ``encode`` and ``decode``.
        max_len: maximum number of new tokens to generate.
        sampling: sample from the softmax distribution when True, otherwise
            pick the highest-scoring token at every step.
        temperature: logits are divided by this value when it is > 0;
            ``0.0`` leaves the logits unscaled.
        top_k: when set, only the ``top_k`` highest logits can be sampled.
        eos_id: optional token id; generation stops as soon as this token is
            produced, and the token itself is not appended.

    Returns:
        The decoded text of the prompt followed by the generated tokens.

    Raises:
        AssertionError: if ``temperature`` or ``top_k`` is set while
            ``sampling`` is False.
    """
    # Validate the argument combination once, before any work is done.
    if not sampling:
        assert temperature==0.0 and top_k is None, "You can't set temperature or topk if sampling=False"

    prompt = apply_text_template(starting_context, input='')
    model.eval()
    input_ids = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)

    # NOTE(review): the running sequence is never cropped, so a prompt plus
    # `max_len` new tokens may exceed the model's context window -- confirm
    # how GPTModel handles over-long inputs.
    with torch.no_grad():
        for _ in range(max_len):
            # Only the logits of the last position predict the next token.
            logits = model(input_ids)[:, -1, :]

            if sampling:
                if top_k:
                    # Smallest logit that is still inside the top-k set.
                    kth_logit = torch.topk(logits, k=top_k, dim=-1).values[:, -1:]
                    # masked_fill keeps everything on the logits' device; no
                    # separate CPU tensor has to be created for -inf.
                    logits = logits.masked_fill(logits < kth_logit, float('-inf'))
                if temperature > 0.0:
                    logits = logits / temperature

                probas = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probas, num_samples=1)
            else:
                idx_next = torch.argmax(logits, dim=-1, keepdim=True)

            # Honour the caller's stop token instead of always running to max_len.
            if eos_id is not None and (idx_next == eos_id).all():
                break

            input_ids = torch.cat([input_ids, idx_next], dim=-1)

    # Index the single batch row instead of squeeze(), which would turn a
    # one-token sequence into a scalar.
    return tokenizer.decode(input_ids[0].tolist())

In [40]:
# AdamW over all model parameters (no parameters are excluded or frozen here).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

In [41]:
def train(model, train_loader, val_loader, optimizer, eval_freq):
    """Run a single pass over ``train_loader`` with periodic evaluation.

    After the first step and then every ``eval_freq`` steps the function
    evaluates on up to 100 batches of each loader, prints the losses and
    prints a sample generation. The value printed as "epoch" is the
    zero-based index of the current batch, not a pass over the dataset.

    Args:
        model: model to optimise; it is moved to the module-level device.
        train_loader: loader yielding ``(input_ids, target_ids)`` batches.
        val_loader: loader used for the validation loss.
        optimizer: optimizer stepping the model parameters.
        eval_freq: number of optimizer steps between evaluations.

    Returns:
        Tuple ``(train_loss_history, val_loss_history)`` with one entry per
        evaluation.
    """
    train_curve, val_curve = [], []
    model.to(device)

    for global_step, (input_ids, target_ids) in enumerate(train_loader, start=1):
        batch_idx = global_step - 1

        # generate() leaves the model in eval mode, so training mode is
        # restored at the start of every step.
        model.train()
        optimizer.zero_grad()
        loss = calc_loss_on_batch(model, input_ids.to(device), target_ids.to(device))
        loss.backward()
        optimizer.step()

        if not (global_step % eval_freq == 0 or batch_idx == 0):
            continue

        train_loss, val_loss = evaluate(model, train_loader, val_loader, max_batch=100)
        print(f"epoch: {batch_idx} | training_loss: {train_loss} | val_loss: {val_loss}")
        train_curve.append(train_loss)
        val_curve.append(val_loss)
        print(f"Sample Generation\n{'-'*20}")
        print(generate(model))

    return train_curve, val_curve
            
            

In [42]:
# Fine-tune for one pass over the training loader, evaluating and printing a
# sample generation every 50 optimizer steps.
# NOTE: the recorded run of this cell stopped with "CUDA error: out of memory".
train_losses, val_losses = train(model, train_loader, val_loader, optimizer, eval_freq=50)

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
