In [None]:
# Prepare the dataset for pre-training
# (OpenWebText data)

In [6]:
!pip install datasets
!pip install tiktoken
!pip install tqdm
!pip install numpy



In [7]:
# Print the attached GPU together with the driver and CUDA versions.
! nvidia-smi

Tue Apr 30 13:00:43 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [8]:
# Directory that receives the tokenised `<split>.bin` files written by the data-preparation cell.
base_path = '/content/tokenised_weights'

In [9]:
import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset # huggingface datasets

# number of workers in .map() call
# good number to use is ~order number of cpu cores // 2
num_proc = 8

# number of workers in load_dataset() call
# best number might be different from num_proc above as it also depends on NW speed.
# it is better than 1 usually though
num_proc_load_dataset = num_proc

enc = tiktoken.get_encoding("gpt2")

# Output directory for the tokenised splits. Defined once for the whole cell;
# exist_ok makes re-running the cell safe.
base_path = '/content/tokenised_weights'
os.makedirs(base_path, exist_ok=True)

if __name__ == '__main__':
    # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
    dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset)
    # For a quick smoke test, the small subset can be loaded instead:
    # dataset = load_dataset('stas/openwebtext-10k')

    # owt by default only contains the 'train' split, so carve out a small
    # held-out split and rename it from 'test' to 'val'.
    split_dataset = dataset["train"].train_test_split(test_size=0.0005, seed=2357, shuffle=True)
    split_dataset['val'] = split_dataset.pop('test')

    def process(example):
        """Tokenise one document and return its token ids plus their count."""
        ids = enc.encode_ordinary(example['text']) # encode_ordinary ignores any special tokens
        ids.append(enc.eot_token) # add the end of text token, e.g. 50256 for gpt2 bpe
        # note: I think eot should be prepended not appended... hmm. it's called "eot" though...
        return {'ids': ids, 'len': len(ids)}

    # tokenize the dataset
    tokenized = split_dataset.map(
        process,
        remove_columns=['text'],
        desc="tokenizing the splits",
        num_proc=num_proc,
    )

    # concatenate all the ids in each dataset into one large file we can use for training
    for split, dset in tokenized.items():
        arr_len = np.sum(dset['len'], dtype=np.uint64)
        filename = os.path.join(base_path, f'{split}.bin')
        dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
        arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))

        # Write in up to 10 contiguous shards, but never request more shards
        # than there are rows: an empty shard would make np.concatenate raise.
        total_batches = max(1, min(10, len(dset)))

        idx = 0
        for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
            # Batch together samples for faster write
            batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
            if len(batch) == 0:
                continue
            arr_batch = np.concatenate(batch['ids'])
            # Write into mmap
            arr[idx : idx + len(arr_batch)] = arr_batch
            idx += len(arr_batch)
        arr.flush()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Loading dataset shards:   0%|          | 0/83 [00:00<?, ?it/s]

writing /content/tokenised_weights/train.bin: 100%|██████████| 10/10 [07:37<00:00, 45.79s/it]
writing /content/tokenised_weights/val.bin: 100%|██████████| 10/10 [00:00<00:00, 11.34it/s]


In [10]:
import math
import inspect
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.nn import functional as F

class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with an optional bias.

    Args:
        ndim: size of the normalised (trailing) dimension.
        bias: when truthy a learnable shift is created, otherwise ``self.bias`` is ``None``.
    """

    def __init__(self, ndim, bias):
        super().__init__()
        # The scale is initialised to ones and the optional shift to zeros.
        self.weight = nn.Parameter(torch.ones(ndim))
        if bias:
            self.bias = nn.Parameter(torch.zeros(ndim))
        else:
            self.bias = None

    def forward(self, input):
        """Return ``input`` layer-normalised with eps = 1e-5."""
        return F.layer_norm(
            input,
            normalized_shape=self.weight.shape,
            weight=self.weight,
            bias=self.bias,
            eps=1e-5,
        )

class SelfAttention(nn.Module):
    """Causal multi-head self-attention.

    Args:
        n_embd: embedding width; must be divisible by ``n_head``.
        n_head: number of attention heads.
        block_size: side length of the causal mask that is registered as a buffer.
        dropout: dropout probability applied to the attention weights and to the output.
        bias: whether the four linear projections carry a bias term.

    Raises:
        ValueError: if ``n_embd`` is not divisible by ``n_head``.
    """

    def __init__(self, n_embd, n_head, block_size, dropout, bias):
        super().__init__()

        # Fail early with a clear message; otherwise the error only shows up
        # as an opaque shape mismatch inside forward()'s .view() calls.
        if n_embd % n_head != 0:
            raise ValueError(f"n_embd ({n_embd}) must be divisible by n_head ({n_head})")

        # key, query, value projections
        self.c_attn_q = nn.Linear(n_embd, n_embd, bias=bias)
        self.c_attn_v = nn.Linear(n_embd, n_embd, bias=bias)
        self.c_attn_k = nn.Linear(n_embd, n_embd, bias=bias)

        # output projection
        self.c_proj = nn.Linear(n_embd, n_embd, bias=bias)

        # regularization
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout = dropout

        # Lower-triangular mask named "bias" (not a learnable bias): position i
        # may only attend to positions <= i.
        self.register_buffer("bias", torch.tril(torch.ones(block_size, block_size))
                                        .view(1, 1, block_size, block_size))

    def forward(self, x):
        """Apply masked self-attention to ``x`` of shape (B, T, C); the result has the same shape."""
        B, T, C = x.shape
        head_dim = C // self.n_head

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v  = self.c_attn_q(x), self.c_attn_k(x), self.c_attn_v(x)
        k = k.view(B, T, self.n_head, head_dim).transpose(1, 2)
        q = q.view(B, T, self.n_head, head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, head_dim).transpose(1, 2)

        # Scaled dot-product scores; masked positions get -inf so softmax gives them weight 0.
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        y = att @ v
        # Re-assemble the heads side by side into the channel dimension.
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y

class MLP(nn.Module):
    """Feed-forward sub-layer: expand to 4x width, GELU, project back, dropout."""

    def __init__(self, n_embd, bias, dropout):
        super().__init__()
        hidden = 4 * n_embd
        # Layer order fixes the state-dict keys (mlp.0.*, mlp.2.*), so keep it stable.
        layers = [
            nn.Linear(n_embd, hidden, bias=bias),
            nn.GELU(),
            nn.Linear(hidden, n_embd, bias=bias),
            nn.Dropout(dropout),
        ]
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.mlp(x)

class Block(nn.Module):
    """One pre-norm transformer layer: attention and MLP, each wrapped in a residual connection."""

    def __init__(self, block_size, vocab_size, n_layer, n_head, n_embd, dropout, bias):
        super().__init__()
        # vocab_size and n_layer are part of the signature but are not used by this layer.
        self.ln_1 = LayerNorm(n_embd, bias=bias)
        self.attn = SelfAttention(n_embd, n_head, block_size, dropout, bias)
        self.ln_2 = LayerNorm(n_embd, bias=bias)
        self.mlp = MLP(n_embd, bias, dropout)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        # Each sub-layer sees the normalised stream and its output is added back.
        after_attn = self.attn(self.ln_1(x)) + x
        return self.mlp(self.ln_2(after_attn)) + after_attn

class GPT(nn.Module):
    """GPT-2 style decoder-only language model.

    Token and learned position embeddings feed a stack of ``n_layer``
    transformer blocks, a final layer norm and a linear head whose weight is
    shared with the token embedding.

    Args:
        block_size: maximum sequence length (size of the position table).
        vocab_size: number of token ids.
        n_layer: number of transformer blocks.
        n_head: attention heads per block.
        n_embd: embedding width.
        dropout: dropout probability used throughout.
        bias: whether Linear/LayerNorm modules carry a bias.
    """

    def __init__(self, block_size=1024, vocab_size=50304, n_layer=12, n_head=12, n_embd=768,
                dropout=0, bias=True):
        super().__init__()
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_embd = n_embd
        self.dropout = dropout
        self.bias = bias

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(vocab_size, n_embd),
            wpe = nn.Embedding(block_size, n_embd),
            drop = nn.Dropout(dropout),
            h = nn.ModuleList([Block(block_size, vocab_size, n_layer, n_head, n_embd, dropout, bias) for _ in range(n_layer)]),
            ln_f = LayerNorm(n_embd, bias=bias),
        ))
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)

        # Weight tying: the token embedding and the output head share one parameter.
        self.transformer.wte.weight = self.lm_head.weight

        # weight initialisation
        self.apply(self._init_weights)

        # Scaled initialisation from GPT paper.
        # NOTE(review): only parameters named `c_proj.weight` are matched, i.e. the
        # attention output projection. The MLP output projection lives inside an
        # nn.Sequential (name ends in `mlp.2.weight`) and keeps std=0.02 - confirm
        # this is intended.
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * n_layer))

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and Linear biases to zero."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None, return_all_logits=False):
        """Run the model on token ids ``idx`` of shape (B, T).

        Returns:
            ``(logits, loss)``. With ``targets`` the logits cover every position
            and ``loss`` is the cross entropy (targets equal to -1 are ignored).
            Without targets ``loss`` is ``None`` and the logits cover only the
            last position unless ``return_all_logits`` is true.
        """
        device = idx.device
        b, t = idx.shape
        # Longer sequences would index past the end of the position table.
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device)

        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        out = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            out = block(out)
        out = self.transformer.ln_f(out)

        if targets is not None:
            # Return cross entropy loss during training.
            logits = self.lm_head(out)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            if return_all_logits:
                # For fine-tuning tasks return all logits and compute custom loss.
                logits = self.lm_head(out)
            else:
                # Return just the last timestep during inference
                # (indexing with [-1] keeps the time dimension).
                logits = self.lm_head(out[:, [-1], :])
            loss = None

        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        """Build an AdamW optimizer with weight decay applied only to tensors with >= 2 dims.

        The fused AdamW kernel is requested only when this PyTorch build exposes
        the ``fused`` argument and ``device_type`` is ``'cuda'``.
        """
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}

        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]

        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        kwargs = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **kwargs)

        return optimizer

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens autoregressively and return ``idx`` extended to (B, T + max_new_tokens)."""
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens; use the model's own limit,
            # not a notebook-level global.
            idx_cond = idx[:, -self.block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx


In [11]:
!pip install wandb --quiet

In [12]:
import os
import time
import math
import pickle

import numpy as np
import torch
import wandb

# from model import GPT
# from config import config

In [25]:
# All tunable settings for the pre-training run, kept in one dict.
config = dict(
    # General training flags
    eval_interval=50,
    log_interval=1,
    eval_iters=5,
    out_dir='result',
    eval_only=False,
    always_save_checkpoint=True,
    init_from='scratch',

    # Weights & Biases flags
    wandb_log=True,
    wandb_project='hw5',
    wandb_run_name='final-pre-training',

    # Batch and model shape
    gradient_accumulation_steps=5,
    batch_size=6,
    block_size=1024,
    n_layer=8,
    n_head=12,
    n_embd=768,
    dropout=0.0,
    bias=False,

    # Optimizer
    learning_rate=1e-5,
    max_iters=10000,
    weight_decay=1e-1,
    beta1=0.9,
    beta2=0.95,
    grad_clip=1.0,

    # Learning rate scheduler flags
    decay_lr=True,
    warmup_iters=0,
    lr_decay_iters=10000,
    min_lr=1e-6,
)

In [27]:
import gc
# Run a garbage-collection pass, then ask PyTorch to release its cached but
# currently unused CUDA memory.
gc.collect() # These commands help you when you face CUDA OOM error
torch.cuda.empty_cache()

In [28]:
def save_checkpoint(model, optimizer, iter_num, filename='checkpoint.pth'):
    """Write the training state to ``filename`` and print a confirmation.

    The saved dict holds the iteration counter plus the model and optimizer
    state dicts under the keys ``iter_num``, ``model_state_dict`` and
    ``optimizer_state_dict``.
    """
    state = {
        'iter_num': iter_num,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    torch.save(state, filename)
    print(f"Checkpoint saved at iteration {iter_num}")

In [29]:
# Single-process run: this process does all logging and checkpointing.
master_process = True
seed_offset = 0

# Expose every config entry as a module-level variable of the same name.
# NOTE(review): this runs before 'device' and 'dtype' are added to config below,
# so those two are not injected by this loop (they are assigned explicitly).
for key, value in config.items():
    globals()[key] = value

# Explicit aliases for the values used directly by the code in this cell.
gradient_accumulation_steps = config['gradient_accumulation_steps']
batch_size = config['batch_size']
block_size = config['block_size']
out_dir = config ['out_dir']
n_layer = config ['n_layer']


# Tokens consumed per optimizer step: micro-batches x sequences x tokens per sequence.
tokens_per_iter = gradient_accumulation_steps * batch_size * block_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")

os.makedirs(out_dir, exist_ok=True)

torch.manual_seed(1337 + seed_offset)
# Allow TF32 in CUDA matmuls and in cuDNN.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# The device is hard-coded to CUDA; this cell will not run on a CPU-only machine.
device = 'cuda'
config['device'] = device
# Use bfloat16 when CUDA is available and reports bf16 support, otherwise float16.
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
config['dtype'] = dtype

device_type = 'cuda' if 'cuda' in device else 'cpu'

# note: float16 data type will automatically use a GradScaler
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
# Autocast context used around every forward pass.
ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# NOTE(review): this is a relative path, while the data-preparation cell writes to
# '/content/tokenised_weights'; the two only coincide when the working directory
# is /content - confirm.
data_dir = 'tokenised_weights'
# Memory-map the token files read-only as uint16, the dtype they were written with.
train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
# Use only 70% of the training data.
train_data = train_data[:int(0.7*len(train_data))]
val_data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
# TODO; Implement a DataLoader for this
def get_batch(split):
    """Draw a random batch of (input, target) token windows from one split.

    ``split == 'train'`` reads from ``train_data``; any other value reads from
    ``val_data``. Both returned tensors are int64 with shape
    (batch_size, block_size); the targets are the inputs shifted by one token.
    """
    source = train_data if split == 'train' else val_data
    starts = torch.randint(len(source) - block_size, (batch_size,))

    def windows(offset):
        # One block_size-long slice per sampled start, stacked into a batch.
        rows = [
            torch.from_numpy(source[i + offset:i + offset + block_size].astype(np.int64))
            for i in starts
        ]
        return torch.stack(rows)

    x, y = windows(0), windows(1)
    if device_type != 'cuda':
        return x.to(device), y.to(device)
    # Pinned host memory allows the host-to-device copy to be non-blocking.
    return (
        x.pin_memory().to(device, non_blocking=True),
        y.pin_memory().to(device, non_blocking=True),
    )

# Iteration counter for this run. It always starts at 0, even when weights are
# warm-started from a checkpoint below, so the LR schedule and max_iters are
# counted from the start of this run.
iter_num = 0
best_val_loss = 1e9

model = GPT(n_layer= config['n_layer'], n_head=config['n_head'], n_embd=config['n_embd'], block_size=config['block_size'],
                  bias=config['bias'], dropout= config['dropout'])
model.to(device)

# initialize a GradScaler. If enabled=False scaler is a no-op
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
# optimizer
optimizer = model.configure_optimizers(config['weight_decay'], config['learning_rate'], (config['beta1'], config['beta2']), device_type)

# Warm-start from an earlier run when its checkpoint is present. Without the
# existence check a fresh runtime crashes here; map_location places the tensors
# on the training device regardless of where they were saved from.
resume_path = '/content/result/checkpoint_iter_30000.pth'
if os.path.exists(resume_path):
    checkpoint = torch.load(resume_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    print(f"Resumed model and optimizer state from {resume_path}")
else:
    checkpoint = None
    print(f"No checkpoint found at {resume_path}; training from the fresh initialisation")
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)
unoptimized_model = model

@torch.no_grad()
def estimate_loss():
    """Return the mean loss over ``config['eval_iters']`` random batches for 'train' and 'val'.

    The model is put in eval mode while the batches are scored and switched
    back to train mode before returning. Values in the returned dict are
    0-dim tensors.
    """
    n_batches = config['eval_iters']

    def batch_loss(split):
        # Score one freshly sampled batch under the autocast context.
        X, Y = get_batch(split)
        with ctx:
            _, loss = model(X, Y)
        return loss.item()

    model.eval()
    out = {}
    for split in ('train', 'val'):
        per_batch = torch.tensor([batch_loss(split) for _ in range(n_batches)])
        out[split] = per_batch.mean()
    model.train()
    return out

# learning rate scheduler with decay and linear warmup according to the GPT paper.
def get_lr(it):
    """Return the learning rate for iteration ``it``.

    The rate rises linearly from 0 to ``config['learning_rate']`` over
    ``config['warmup_iters']`` iterations, then follows a cosine decay down to
    ``config['min_lr']`` at ``config['lr_decay_iters']``, and stays at
    ``min_lr`` afterwards.
    """
    warmup = config['warmup_iters']
    decay_end = config['lr_decay_iters']
    peak_lr = config['learning_rate']
    floor_lr = config['min_lr']

    if it < warmup:
        return peak_lr * it / warmup
    # '>=' rather than '>': at it == decay_end the cosine term is exactly 0, so
    # the value is unchanged, and the division below can no longer hit a zero
    # denominator when decay_end == warmup.
    if it >= decay_end:
        return floor_lr
    decay_ratio = (it - warmup) / (decay_end - warmup)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return floor_lr + coeff * (peak_lr - floor_lr)

# logging
if config['wandb_log'] and master_process:
    # Credentials must never be committed to a notebook. The key is read from
    # the WANDB_API_KEY environment variable; when it is unset, key=None lets
    # wandb fall back to its stored login or an interactive prompt.
    # Any key that was previously stored in this cell should be revoked.
    wandb.login(key=os.environ.get("WANDB_API_KEY"))
    wandb.init(project=config['wandb_project'], name=config['wandb_run_name'], config=config)

# training loop
# Each pass of the while-loop is one optimizer step made of
# `gradient_accumulation_steps` micro-batches. The loop stops once iter_num
# exceeds config['max_iters'], or after the first evaluation when
# config['eval_only'] is set.
X, Y = get_batch('train') # fetch the very first batch
t0 = time.time()
local_iter_num = 0 # number of iterations in the lifetime of this process
raw_model = model

while True:
    # determine and set the learning rate for this iteration
    lr = get_lr(iter_num) if config['decay_lr'] else config['learning_rate']
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    # evaluate the loss on train/val sets and write checkpoints

    if iter_num % config['eval_interval'] == 0 and master_process:
        losses = estimate_loss()
        # current_lr = optimizer.param_groups[0]['lr']
        current_lr = lr
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, lr {current_lr:.6f}")
        # Wandb logging
        # scheduler.step(losses['val'])
        if config['wandb_log']:
            wandb.log({
                "iter": iter_num,
                "train/loss": losses['train'],
                "val/loss": losses['val'],
                "lr": current_lr,
            })

    if iter_num == 0 and config['eval_only']:
        break

    # if iter_num % 2 == 0 and iter_num != 0:
    #     context = torch.zeros((1, 1), dtype=torch.long, device=device)
    #     generated_tokens = model.generate(context, max_new_tokens=2000)[0].tolist()
    #     enc = tiktoken.get_encoding("gpt2")
    #     generated_text = enc.decode(generated_tokens)
    #     print(f"Generated text at step {iter_num}: {generated_text}")

    # Accumulate gradients over several micro-batches before stepping.
    for micro_step in range(gradient_accumulation_steps):
        with ctx:
            logits, loss = model(X, Y)
            # Scale each micro-batch loss so the accumulated gradient is an average, not a sum.
            loss = loss / gradient_accumulation_steps

        # Fetch the next micro-batch before running the backward pass.
        X, Y = get_batch('train')
        # The scaler multiplies the loss before backward; it is a no-op when disabled.
        scaler.scale(loss).backward()

    # Gradient Clipping
    if config['grad_clip'] != 0.0:
        # Gradients are unscaled first so the clip threshold applies to their true norm.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), config['grad_clip'])

    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad(set_to_none=True)
    # Every 100th iteration (including iteration 0) is written to its own file in out_dir.
    if iter_num % 100 == 0:
      save_checkpoint(model, optimizer, iter_num, filename=os.path.join(out_dir, f'checkpoint_iter_{iter_num}.pth'))

    # timing and logging
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % config['log_interval'] == 0 and master_process:
        current_lr = optimizer.param_groups[0]['lr']
        # `loss` is the last micro-batch only; multiplying undoes the accumulation scaling.
        lossf = loss.item() * gradient_accumulation_steps
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, lr {current_lr:.6f}" )
    iter_num += 1
    local_iter_num += 1

    if iter_num > config['max_iters']:
        break

tokens per iteration will be: 30,720




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
iter,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
lr,███████▇▇▇▇▇▇▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
train/loss,▇██▆▅▅▆▆▅▅▆▆▅▆▄▃▅▆▅▅▃▃▄▄▂▂▂▁▂▂▃▁▂▃▄▃▂▁▂▂
val/loss,█▇▆▇▆▅▆▆▅▅▆▅▆▆▅▅▅▅▄▃▆▄▃▄▃▂▃▂▄▃▁▄▃▂▂▂▁▁▄▁

0,1
iter,30000.0
lr,1e-05
train/loss,3.45689
val/loss,3.43262


step 0: train loss 3.3499, val loss 3.3847, lr 0.000010
Checkpoint saved at iteration 0
iter 0: loss 3.5493, time 6493.41ms, lr 0.000010
iter 1: loss 3.3036, time 1791.12ms, lr 0.000010
iter 2: loss 3.2740, time 2473.05ms, lr 0.000010
iter 3: loss 3.3075, time 2507.87ms, lr 0.000010
iter 4: loss 3.0703, time 2526.37ms, lr 0.000010
iter 5: loss 2.9618, time 2510.54ms, lr 0.000010
iter 6: loss 3.2783, time 2510.38ms, lr 0.000010
iter 7: loss 3.5350, time 2490.19ms, lr 0.000010
iter 8: loss 3.4718, time 2480.19ms, lr 0.000010
iter 9: loss 3.4043, time 2469.94ms, lr 0.000010
iter 10: loss 3.3374, time 2449.13ms, lr 0.000010
iter 11: loss 3.5556, time 2422.11ms, lr 0.000010
iter 12: loss 3.5128, time 2409.85ms, lr 0.000010
iter 13: loss 3.4091, time 2407.17ms, lr 0.000010
iter 14: loss 3.5279, time 2395.70ms, lr 0.000010
iter 15: loss 3.3942, time 2412.32ms, lr 0.000010
iter 16: loss 3.4968, time 2375.51ms, lr 0.000010
iter 17: loss 3.3006, time 2383.84ms, lr 0.000010
iter 18: loss 3.2535, 

KeyboardInterrupt: 

In [None]:
# Sample 1024 new tokens, starting from a single token with id 0, and print the decoded text.
context = torch.zeros((1, 1), dtype=torch.long, device=device)

# Sampling needs no gradients and should run in eval mode; the previous
# train/eval mode is restored afterwards so training can continue.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        generated = model.generate(context, max_new_tokens=1024)
finally:
    model.train(was_training)

print(enc.decode(generated[0].tolist()))

# context = torch.zeros((1, 1), dtype=torch.long, device=device)
# generated_tokens = model.generate(context, max_new_tokens=2000)[0].tolist()
# generated_text = enc.decode(generated_tokens)
# print(f"Generated text at step {iter_num}: {generated_text}")

In [None]:
#FINE TUNING
