In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset

from model import GPTModel
from config import Config
from load_weights_from_gpt import load_weights_into_gpt
from gpt_download import download_and_load_gpt2
from model_generate import generate
import tiktoken

from torch.utils.data import DataLoader

In [4]:
# Hyperparameters handed to GPTModel below. The exact meaning of each key is
# defined by Config / GPTModel (not shown in this notebook).
# NOTE(review): the sizes (24 layers, width 1024, 16 heads) are presumably
# chosen to match the GPT-2 '355M' checkpoint loaded later - confirm against
# that checkpoint's hparams.json.
config = Config(
    {
        'n_layers': 24,
        'd_model': 1024,
        'eps': 1e-5,
        'hidden_size_multiplier': 4,
        'num_heads': 16,
        'context_len': 1024,
        'dropout': 0.01,
        'qkv_bias': True,
        'vocab_size': 50257,
    }
)

In [5]:
# Sanity check: show whether PyTorch can see a usable CUDA device.
torch.cuda.is_available()

True

In [6]:
# Choose the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"device : {device}")

device : cuda


In [7]:
# Load tiktoken's 'gpt2' encoding; this tokenizer object is used by every
# later cell that encodes or decodes text.
tokenizer = tiktoken.get_encoding('gpt2')

In [8]:
# Build the model from `config` and move it to the selected device.
model = GPTModel(config=config).to(device)

In [9]:
# Fetch the GPT-2 '355M' checkpoint into ./gpt2 (the helper reports files that
# already exist and are up to date instead of re-downloading them).
# NOTE(review): `settigns` is a misspelling of `settings`; it is left as-is
# here in case other cells reference the name. Only `params` is used below.
settigns, params = download_and_load_gpt2(model_size='355M', models_dir='gpt2')

File already exists and is up-to-date: gpt2\355M\checkpoint
File already exists and is up-to-date: gpt2\355M\encoder.json
File already exists and is up-to-date: gpt2\355M\hparams.json
File already exists and is up-to-date: gpt2\355M\model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2\355M\model.ckpt.index
File already exists and is up-to-date: gpt2\355M\model.ckpt.meta
File already exists and is up-to-date: gpt2\355M\vocab.bpe


In [10]:
# Load the downloaded GPT-2 parameters into `model`. The return value is
# discarded, so this presumably updates the model in place - confirm in
# load_weights_from_gpt.
load_weights_into_gpt(model, params)

In [11]:
# Smoke-test the pretrained weights by generating a continuation of a short
# code prompt (before any fine-tuning on the instruction data).
print(generate(model, 
               starting_context='fibonocci code in python: def fibonacci(n):', 
               tokenizer=tokenizer, 
               max_len=40))

fibonocci code in python: def fibonacci(n): """ creates an infinite loop by square 1"""for i in range(100, N): circled += Ncatches=(i+0.5*i+100) how long += 1return '


In [12]:
# Load the 'teknium/openhermes' instruction dataset with Hugging Face `datasets`.
data = load_dataset('teknium/openhermes')

In [13]:
# Display the available splits, their columns and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 242831
    })
})

In [14]:
# Peek at the first training row to see the raw field layout.
data['train'][0]

 'input': '',
 'instruction': 'Write a Perl script that processes a log file and counts the occurrences of different HTTP status codes. The script should accept the log file path as a command-line argument and print the results to the console in descending order of frequency.\n'}

In [15]:
def format_text_template(batch):
    """Render one dataset row as a single instruction-style prompt string.

    Despite the parameter name, this is applied to one row at a time (it is
    used with a non-batched ``map``).

    Args:
        batch: Mapping with ``'instruction'``, ``'input'`` and ``'output'``
            entries.

    Returns:
        ``{'text': prompt}`` where ``prompt`` is the three fields laid out
        under ``### Instruction:``, ``### Input:`` and ``### Response:``
        headers, one per line.
    """
    layout = "\n".join((
        "### Instruction: {instruction}",
        "### Input: {input}",
        "### Response: {output}",
    ))
    prompt = layout.format(
        instruction=batch['instruction'],
        input=batch['input'],
        output=batch['output'],
    )
    return {'text': prompt}

In [16]:
# Try the template on three hand-picked training rows before mapping it over
# the whole dataset, and print the first rendered prompt.
subset_data = data['train'].select([10, 20, 40]).map(format_text_template)

print(subset_data['text'][0])

### Instruction: Design a roller coaster with three distinct features, explaining the purpose of each feature and how it contributes to the overall ride experience.
### Input: 
### Response: 1. The Gravity-Defying Loop: One of the most iconic and thrilling features of our roller coaster is a massive, vertical loop that takes riders upside down as they travel through it at high speeds. This gravity-defying loop creates an intense sensation of weightlessness and disorientation for riders, making them feel like they are defying the laws of physics. As the train enters the loop, riders experience strong positive G-forces pushing them into their seats, followed by brief moments of weightlessness at the top of the loop before being pushed back into their seats again as they exit the loop. This feature contributes to the overall ride experience by providing a heart-pounding moment of adrenaline and excitement that leaves riders wanting more.

2. The Airtime Hills: To create a dynamic and vari

In [17]:
# Render every row into a single 'text' column and drop the raw fields.
# NOTE(review): this overwrites `data`, so the cell is not re-runnable - a
# second run would look up 'instruction'/'input'/'output' on rows that no
# longer have them. Reload `data` before re-executing.
data = data.map(format_text_template, remove_columns=['output','input','instruction'])

In [18]:
# Tokenize the rendered prompts.
def tokenize(batch):
    """Encode a batch of prompt strings into token ids.

    Uses the notebook-level ``tokenizer``. ``'<|endoftext|>'`` is passed as an
    allowed special token, so text containing it is accepted by the encoder.

    Args:
        batch: Mapping whose ``'text'`` entry is a list of strings (the shape
            a batched ``map`` provides).

    Returns:
        ``{'input_ids': ids}`` with one list of token ids per input string.
    """
    encoded = tokenizer.encode_batch(
        batch['text'],
        allowed_special={'<|endoftext|>'},
    )
    return dict(input_ids=encoded)

In [None]:
# Tokenize every split in batches, replacing the 'text' column with 'input_ids'.
# NOTE(review): the saved output of the later `dataset` display cell still
# lists a 'text' feature, which suggests it came from an earlier version of
# this cell - re-run to refresh it.
dataset = data.map(tokenize, batched=True, remove_columns=['text'])

In [20]:
# Round-trip check: decode the first training row's token ids back into text.
tokenizer.decode(dataset['train'][0]['input_ids'])



In [22]:
# Display the tokenized dataset's splits, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids'],
        num_rows: 242831
    })
})

In [24]:
# Show the id of the tokenizer's end-of-text token; the collate function
# defined in this notebook uses it as both sequence terminator and padding.
tokenizer.eot_token

50256

In [39]:
def language_model_collate_fn(batch, max_len=None, tokenizer=tokenizer):
    """Collate variable-length token sequences into padded (input, target) tensors.

    Every sequence gets the tokenizer's end-of-text token appended, is
    truncated to the working length, right-padded with that same token, and
    is then split into next-token-prediction pairs: the inputs are all tokens
    but the last, the targets are all tokens but the first.

    Args:
        batch: List of examples. Each example is either a list of token ids
            or a dict with an ``'input_ids'`` entry holding that list (the
            form a ``DataLoader`` passes for a dataset with an ``input_ids``
            column).
        max_len: Optional cap on the sequence length, counted after the
            end-of-text token is appended and before the input/target shift.
            Longer sequences are truncated, and the batch is padded to
            ``min(longest sequence, max_len)``. ``None`` (the default) pads to
            the longest sequence in the batch. To use a cap with a
            ``DataLoader``, bind it first, e.g. with ``functools.partial``.
        tokenizer: Object exposing ``eot_token``; that id is used both as the
            terminator and as the padding value.

    Returns:
        Tuple ``(input_ids, target_ids)`` of tensors with one row per example
        and ``working length - 1`` columns each.
    """
    pad_tok = tokenizer.eot_token

    # Accept raw id lists as well as dataset rows ({'input_ids': [...]}); the
    # latter is what a DataLoader hands over, and dict + list would raise.
    sequences = [
        list(item['input_ids']) if isinstance(item, dict) else list(item)
        for item in batch
    ]

    # Terminate every sequence with the end-of-text token.
    sequences = [seq + [pad_tok] for seq in sequences]

    # Longest sequence in this batch (terminator included); 0 for an empty batch.
    batch_max_len = max((len(seq) for seq in sequences), default=0)

    # The working length never exceeds the cap. (Taking the larger of the two
    # here would disable truncation and pad every batch up to the cap.)
    if max_len:
        max_len = min(batch_max_len, max_len)
    else:
        max_len = batch_max_len

    # Truncate to the working length, then right-pad shorter sequences to it.
    sequences = [seq[:max_len] for seq in sequences]
    sequences = [seq + [pad_tok] * (max_len - len(seq)) for seq in sequences]

    # Language-modeling shift: position i of the input predicts position i + 1.
    input_ids = [seq[:-1] for seq in sequences]
    target_ids = [seq[1:] for seq in sequences]

    return torch.tensor(input_ids), torch.tensor(target_ids)

In [40]:
# Try the collate function on three toy sequences of different lengths; the
# shorter ones should come back right-padded with the end-of-text id and the
# targets should be the inputs shifted left by one position.
language_model_collate_fn(batch=[[1,2,4,5,6],[2,3,3],[1]],
                          tokenizer=tokenizer)

(tensor([[    1,     2,     4,     5,     6],
         [    2,     3,     3, 50256, 50256],
         [    1, 50256, 50256, 50256, 50256]]),
 tensor([[    2,     4,     5,     6, 50256],
         [    3,     3, 50256, 50256, 50256],
         [50256, 50256, 50256, 50256, 50256]]))

In [None]:
# Initialize the data loaders.
# The dataset displayed earlier in this notebook only has a 'train' split, so
# indexing dataset['val'] / dataset['test'] directly raises a KeyError. Use
# those splits when they exist; otherwise carve them out of 'train'.
heldout_fraction = 0.1  # share of rows kept out of training (half val, half test)
split_seed = 42         # fixed so the same rows land in each split on every run

if 'val' in dataset and 'test' in dataset:
    train_split = dataset['train']
    val_split = dataset['val']
    test_split = dataset['test']
else:
    train_heldout = dataset['train'].train_test_split(test_size=heldout_fraction,
                                                      seed=split_seed)
    val_test = train_heldout['test'].train_test_split(test_size=0.5,
                                                      seed=split_seed)
    train_split = train_heldout['train']
    val_split = val_test['train']
    test_split = val_test['test']

# Only the training loader shuffles and drops the last partial batch.
train_loader = DataLoader(dataset=train_split, batch_size=8,
                          collate_fn=language_model_collate_fn,
                          pin_memory=True, shuffle=True,
                          drop_last=True)

val_loader = DataLoader(dataset=val_split, batch_size=8,
                        collate_fn=language_model_collate_fn)

test_loader = DataLoader(dataset=test_split, batch_size=8,
                         collate_fn=language_model_collate_fn)