In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset

from model import GPTModel
from config import Config
from load_weights_from_gpt import load_weights_into_gpt
from gpt_download import download_and_load_gpt2
from model_generate import generate
import tiktoken

In [2]:
# Hyperparameters handed to GPTModel. The sizes are intended to match the
# GPT-2 '355M' checkpoint that this notebook downloads and loads afterwards.
config = Config(dict(
    n_layers=24,
    d_model=1024,
    eps=1e-5,                    # presumably the normalisation epsilon - confirm in model.py
    hidden_size_multiplier=4,    # presumably feed-forward width relative to d_model - confirm
    num_heads=16,
    context_len=1024,
    dropout=0.01,
    qkv_bias=True,
    vocab_size=50257,
))

In [3]:
# Quick sanity check: does PyTorch see a CUDA device in this environment?
torch.cuda.is_available()

True

In [4]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("device : {}".format(device))

device : cuda


In [5]:
# tiktoken's 'gpt2' encoding; this tokenizer is what gets passed to generate() below.
tokenizer = tiktoken.get_encoding('gpt2')

In [6]:
# Build the network from `config`, then place it on the selected device.
model = GPTModel(config=config)
model = model.to(device)

In [7]:
# Download the GPT-2 355M checkpoint into ./gpt2 (files already present are reused,
# as the log below shows). The call yields two values: the checkpoint settings
# (presumably the contents of hparams.json - confirm) and the parameters.
settings, params = download_and_load_gpt2(model_size='355M', models_dir='gpt2')
# The first name was originally misspelled as `settigns`; keep that name bound
# so any other cell that still refers to it keeps working.
settigns = settings

File already exists and is up-to-date: gpt2\355M\checkpoint
File already exists and is up-to-date: gpt2\355M\encoder.json
File already exists and is up-to-date: gpt2\355M\hparams.json
File already exists and is up-to-date: gpt2\355M\model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2\355M\model.ckpt.index
File already exists and is up-to-date: gpt2\355M\model.ckpt.meta
File already exists and is up-to-date: gpt2\355M\vocab.bpe


In [8]:
# Load the downloaded GPT-2 parameters (`params`) into `model`.
# NOTE(review): `model` was moved to `device` before this call. Whether the loaded
# tensors end up on that same device depends on load_weights_into_gpt - confirm.
load_weights_into_gpt(model, params)

In [14]:
# torch.nn.Module instances start in training mode, which leaves dropout active
# (the config sets "dropout": 0.01). Switch to eval mode before sampling so the
# generated text is not perturbed by it. This is a no-op if generate() already
# does the same internally.
model.eval()

# Sample a short continuation from the pretrained weights as a sanity check.
print(generate(model,
               starting_context='fibonocci code in python: def fibonacci(n):',
               tokenizer=tokenizer,
               max_len=40))

fibonocci code in python: def fibonacci(n): """Return the Fibonacci number at the given number of points """ res = (n-1)*(length(uniform_rst)**5)/asctip=line.default


In [10]:
# Load the 'teknium/openhermes' instruction dataset via Hugging Face `datasets`.
data = load_dataset('teknium/openhermes')

In [11]:
# Display the splits, column names and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 242831
    })
})

In [12]:
# Peek at the first training record to see the raw fields.
data['train'][0]

 'input': '',
 'instruction': 'Write a Perl script that processes a log file and counts the occurrences of different HTTP status codes. The script should accept the log file path as a command-line argument and print the results to the console in descending order of frequency.\n'}

In [44]:
def format_text_template(batch):
    """Render instruction / input / output fields into one prompt string.

    Usable with ``Dataset.map`` in both calling conventions:

    * per example (the default): every value in ``batch`` is a single string
      and the returned ``'text'`` is a single string;
    * ``batched=True``: every value is a list (or tuple) of strings and the
      returned ``'text'`` is a list with one formatted string per row.

    Args:
        batch: mapping with the keys ``'instruction'``, ``'input'`` and
            ``'output'``.

    Returns:
        ``{'text': ...}`` holding the formatted prompt(s).

    Raises:
        KeyError: if one of the three expected keys is missing.
    """
    template = (
        "### Instruction: {instruction}\n"
        "### Input: {input}\n"
        "### Response: {output}"
    )

    instructions = batch['instruction']
    inputs = batch['input']
    outputs = batch['output']

    # With batched=True the columns arrive as sequences; format row by row
    # instead of interpolating the whole list into a single string.
    if isinstance(instructions, (list, tuple)):
        return {
            'text': [
                template.format(instruction=ins, input=inp, output=out)
                for ins, inp, out in zip(instructions, inputs, outputs)
            ]
        }

    return {'text': template.format(instruction=instructions, input=inputs, output=outputs)}

In [48]:
# Smoke-test the template on three hand-picked rows before mapping the whole dataset.
subset_data = data['train'].select([10, 20, 40]).map(format_text_template)

print(subset_data[0]['text'])

### Instruction: Design a roller coaster with three distinct features, explaining the purpose of each feature and how it contributes to the overall ride experience.
### Input: 
### Response: 1. The Gravity-Defying Loop: One of the most iconic and thrilling features of our roller coaster is a massive, vertical loop that takes riders upside down as they travel through it at high speeds. This gravity-defying loop creates an intense sensation of weightlessness and disorientation for riders, making them feel like they are defying the laws of physics. As the train enters the loop, riders experience strong positive G-forces pushing them into their seats, followed by brief moments of weightlessness at the top of the loop before being pushed back into their seats again as they exit the loop. This feature contributes to the overall ride experience by providing a heart-pounding moment of adrenaline and excitement that leaves riders wanting more.

2. The Airtime Hills: To create a dynamic and vari

In [None]:
# Replace the three raw columns with the single formatted 'text' column.
# `data` is rebound to the mapped result, so a second run of this cell would
# pass format_text_template rows that no longer carry those columns (KeyError).
# Mapping only while the raw columns are still present keeps the cell re-runnable.
raw_columns = ['output', 'input', 'instruction']
needs_formatting = all(
    set(raw_columns) <= set(split_columns)
    for split_columns in data.column_names.values()
)
if needs_formatting:
    data = data.map(format_text_template, remove_columns=raw_columns)

Map:   0%|          | 0/242831 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction', 'text'],
        num_rows: 242831
    })
})