In [1]:
import math
import torch
import torch.nn as nn
import numpy as np
from model import GPT, GPTConfig
from context_free_grammar import CFG
import wandb

In [2]:
# Authenticate with Weights & Biases; the training loop below reports its metrics through wandb.log.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33maboitrea[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

### Define the context-free grammar and the model

In [3]:
# Grammar the sentences are sampled from.
# NOTE(review): the exact meaning of L, ns, nr and T is defined by the project-local CFG class — confirm there.
cfg = CFG(L=3, ns=[1, 3, 9, 10], nr=[2, 2, 2], T=[8, 8, 8])
# Number of symbols in one flattened sentence: the product of the entries of cfg.T (8*8*8 = 512).
sentence_length = np.prod(cfg.T)

In [4]:
# Model hyper-parameters. The vocabulary size is the last entry of cfg.ns, and the context
# window is one token shorter than a sentence because the model is fed sentences with their
# last symbol removed (see get_batch).
config = GPTConfig(vocab_size=cfg.ns[-1],
                   block_size=sentence_length-1,
                   n_embd=384, n_head=6,
                   n_layer=6,
                   batch_size=100)
m = GPT(config)
# Wrap the model in DataParallel; the underlying GPT remains reachable as m.module.
m = nn.DataParallel(m)
# Move to the configured device. As the last expression of the cell, the module repr is displayed.
m.to(config.device)

number of parameters: 10.63M


DataParallel(
  (module): GPT(
    (transformer): ModuleDict(
      (wte): Embedding(10, 384)
      (wpe): Embedding(511, 384)
      (drop): Dropout(p=0.0, inplace=False)
      (h): ModuleList(
        (0): Block(
          (ln_1): LayerNorm()
          (attn): MultiHeadAttention(
            (heads): ModuleList(
              (0): Head(
                (key): Linear(in_features=384, out_features=64, bias=False)
                (query): Linear(in_features=384, out_features=64, bias=False)
                (value): Linear(in_features=384, out_features=64, bias=False)
                (dropout): Dropout(p=0.0, inplace=False)
              )
              (1): Head(
                (key): Linear(in_features=384, out_features=64, bias=False)
                (query): Linear(in_features=384, out_features=64, bias=False)
                (value): Linear(in_features=384, out_features=64, bias=False)
                (dropout): Dropout(p=0.0, inplace=False)
              )
              (2): Head(


In [5]:
 # print the number of parameters in the model
# Sum the element counts of every parameter tensor of the wrapped model, in millions.
# million_params is reused later for the wandb run name and the logged architecture string.
million_params = sum(map(torch.numel, m.parameters())) / 1e6
print(million_params, "M parameters")

10.824192 M parameters


### Define some useful functions for training/validation steps

In [6]:
# data loading = sample new sentences to fill-in the mini-batch
def get_batch(config: GPTConfig = GPTConfig()):
    """Sample a fresh mini-batch of sentences and split it into inputs and next-token targets.

    Draws ``config.batch_size`` sentences from the notebook-level grammar ``cfg`` (the labels
    returned alongside them are discarded), flattens each sentence to ``sentence_length``
    symbols and returns ``(x, y)`` on ``config.device``:

    * ``x`` -- every sentence without its last symbol, shape ``(N, sentence_length-1)``
    * ``y`` -- the same sentences shifted left by one symbol, shape ``(N, sentence_length-1)``
    """
    sentences, _unused_labels = cfg.sample(config.batch_size)
    # One row per sampled sentence; the row count should equal config.batch_size.
    flat = sentences.view(sentences.shape[0], sentence_length)
    inputs = flat[:, :-1]                 # drop the last symbol
    targets = flat[:, 1:].contiguous()    # drop the first symbol
    return inputs.to(config.device), targets.to(config.device)

In [7]:
@torch.no_grad()
def estimate_loss(m, eval_iters=100):
    """Estimate the validation loss of ``m`` on freshly sampled sentences.

    Puts the model in eval mode, draws ``eval_iters`` new mini-batches and returns the mean
    next-token cross-entropy over them as a Python float. The model is left in eval mode.

    Batches are sampled with the notebook-level ``config`` so that validation uses the same
    batch size and device as training (calling ``get_batch()`` without an argument would
    silently fall back to a default ``GPTConfig()``).

    Args:
        m: the model; called as ``m(X)`` and expected to return logits.
        eval_iters: number of mini-batches to average over.
    """
    # Takes 20s for 100 sentences
    m.eval()
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
        X, Y = get_batch(config)
        logits = m(X)
        # Flatten batch and sequence dimensions so every position is one classification example.
        loss = nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), Y.view(-1), ignore_index=-1)
        losses[k] = loss.item()
    return losses.mean().item()

In [8]:
@torch.no_grad()
def eval_errors(m, n_gen=100, context_length=3):
    """Score sentences completed by the model against the grammar.

    ``n_gen`` real sentences are sampled from ``cfg``; the model receives the first
    ``context_length`` symbols of each and generates the remaining ones (temperature 0.1).

    Returns:
        ``(acc, per_level)`` where ``acc`` is the value returned by
        ``cfg.frac_of_gramatically_correct_sentences`` for the generated sentences and
        ``per_level`` is an array of length ``cfg.L`` holding, for each level, the percentage
        of generated sentences counted as correct at that level.
    """
    # Takes 40s for 100 sentences
    # generate() is called on the bare model, so unwrap DataParallel first
    model = m.module if isinstance(m, nn.DataParallel) else m
    model.eval()

    real_sentences = cfg.sample(n_gen)[0].view(n_gen, sentence_length)
    context = real_sentences[:, :context_length].to(config.device)
    n_new_tokens = sentence_length - context_length
    gen_sentences = model.generate(context, max_new_tokens=n_new_tokens, temperature=0.1)

    # overall accuracy, computed on sentences reshaped to the grammar's layout
    gen_sentences = gen_sentences.view([n_gen] + cfg.T).cpu()
    acc = cfg.frac_of_gramatically_correct_sentences(gen_sentences)

    # per-level counts: walk the error list from its last entry down to the first and stop
    # at the first level with errors, so a sentence can only be good at level i if it was
    # good at all levels between L and i+1
    hits_per_level = np.zeros(cfg.L)
    for sentence in gen_sentences:
        _, err = cfg.collapse_and_get_err(sentence)
        level = len(err) - 1
        while level >= 0 and err[level].sum() == 0:
            hits_per_level[level] += 1
            level -= 1

    return acc, hits_per_level / n_gen * 100

### Learning rate scheduler

In [9]:
def get_lr(i, i_final, lr_max=None, lr_min=None):
    """Cosine learning-rate schedule decaying from ``lr_max`` (step 0) to ``lr_min`` (step ``i_final``).

    Args:
        i: current global step.
        i_final: step at which the schedule reaches ``lr_min``.
        lr_max: upper bound; defaults to the notebook-level ``max_lr``.
        lr_min: lower bound; defaults to the notebook-level ``min_lr``.

    Returns:
        The learning rate for step ``i``. Steps outside ``[0, i_final]`` are clamped, so the
        rate never climbs back up after the end of the schedule, and a non-positive
        ``i_final`` yields ``lr_min`` instead of a division by zero.
    """
    # Only fall back to the globals when no explicit bound is given, so the schedule can be
    # evaluated independently of whatever max_lr/min_lr are currently bound to.
    hi = max_lr if lr_max is None else lr_max
    lo = min_lr if lr_min is None else lr_min
    if i_final <= 0:
        return lo
    i = min(max(i, 0), i_final)
    coeff = 0.5 * (1.0 + math.cos(math.pi * i/i_final)) # decays from 1 to 0
    return lo + coeff * (hi - lo)

### Training parameters

In [10]:
# adamw optimizer
max_lr = 6e-4 # max learning rate
min_lr = max_lr/10  # lower bound of the cosine schedule computed by get_lr
decay_lr = True  # when False, train() uses max_lr for every step instead of calling get_lr

weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0

# m is wrapped in nn.DataParallel, so the GPT's own method is reached through m.module.
# NOTE(review): device_type is hard-coded to 'cuda' while tensors elsewhere are placed on
# config.device — confirm the two agree.
optimizer = m.module.configure_optimizers(weight_decay, max_lr, (beta1, beta2), device_type='cuda')

num decayed parameter tensors: 128, with 10,816,896 parameters
num non-decayed parameter tensors: 19, with 7,296 parameters


In [11]:
# Run settings: read by train() and passed to wandb.init as the run configuration.
training_parameters = dict(
    num_epoch=50,
    batches_per_epoch=100,
    eval_iters=100,
    quality_metric_iters=100,
    learning_rate=6e-4,  # Start lr
    architecture=f'GPT {million_params:.1f}M',
    grammar=str(cfg),
    batch_size=config.batch_size,
)

In [12]:
# Training loop
# 1 epoch (train + val is 1m20s)
def train(m):
    """Train ``m`` on freshly sampled sentences and log metrics to wandb once per epoch.

    Reads the notebook-level ``training_parameters``, ``config``, ``optimizer``, ``decay_lr``,
    ``max_lr`` and ``grad_clip``. Every step sets the learning rate (cosine schedule from
    ``get_lr`` when ``decay_lr`` is true, otherwise ``max_lr``), samples a new batch, and
    takes one optimizer step; gradients are clipped to a norm of ``grad_clip`` unless it is 0.

    At the end of each epoch the following are printed and sent to ``wandb.log``:
    the number of sentences seen so far (including the epoch just finished), the mean
    training loss over the epoch's batches, the validation loss, the accuracy in percent,
    the current learning rate and the per-level percentages from ``eval_errors``.
    """
    num_epoch = training_parameters['num_epoch']
    batches_per_epoch = training_parameters['batches_per_epoch']

    print(f'One epoch is {training_parameters["batches_per_epoch"]} steps,' +
    f'validation loss is computed at the end of every epoch and quality metric is '+
    f'averaged over {training_parameters["quality_metric_iters"]} sentences')
    print(f'Will run for {training_parameters["num_epoch"]} epochs')
    total_num_iter = num_epoch * batches_per_epoch
    for epoch in range(num_epoch):
        train_loss_sum = .0
        m.train()  # the end-of-epoch evaluation below switches the model to eval mode
        for step in range(batches_per_epoch):
            # determine and set the learning rate for this iteration
            current_global_iter = step + epoch * batches_per_epoch
            lr = get_lr(current_global_iter, total_num_iter) if decay_lr else max_lr
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            # sample a batch of data
            xb, yb = get_batch(config)
            # evaluate the loss
            optimizer.zero_grad()
            logits = m(xb)
            loss = nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), yb.view(-1), ignore_index=-1)
            train_loss_sum += loss.item()
            loss.backward()
            # honour the configured clipping threshold (0.0 disables clipping)
            if grad_clip != 0.0:
                nn.utils.clip_grad_norm_(m.parameters(), grad_clip)
            optimizer.step()
        # the sum holds one loss per batch, so average over the number of batches
        # (not over the batch size); max() guards an epoch configured with zero batches
        train_loss = train_loss_sum / max(batches_per_epoch, 1)
        # evaluate the loss on newly generated sentences at the end of every epoch
        val_loss = estimate_loss(m, training_parameters["eval_iters"])
        acc, errors = eval_errors(m, training_parameters['quality_metric_iters'], context_length=3)
        # logged after the epoch has completed, hence epoch + 1
        log_dict = {"nb sentences seen": (epoch + 1) * batches_per_epoch * config.batch_size,
                    "val_loss": val_loss,
                    "train_loss": train_loss,
                    "accuracy": acc * 100,
                    "learning_rate": optimizer.param_groups[0]["lr"]}
        for i,err in enumerate(errors):
            log_dict[f'% of correct sentences at level {i}'] = err

        print(log_dict)
        wandb.log(log_dict)

In [13]:
# Open the W&B run as a context manager so it is always finished — and flagged as failed —
# when training raises, instead of leaving a dangling run behind.
with wandb.init(project='CFG', config=training_parameters, name=f'GPT {million_params:.1f}M temp=0.1'):
    wandb.watch(m, log='all')

    train(m)

One epoch is 100 steps,validation loss is computed at the end of every epoch and quality metric is averaged over 100 sentences
Will run for 50 epochs


NameError: name 'math' is not defined

In [None]:
# Alternative one-cycle schedule, sized to be stepped once per epoch.
# train() sets the learning rate itself through get_lr and never steps this scheduler.
max_lr = 6e-4  # Maximum learning rate
# NOTE(review): min_lr is not consumed by OneCycleLR (the final rate is derived from the two
# div factors below), and this assignment rebinds the global that get_lr reads — confirm intent.
min_lr = 1e-6  # Minimum learning rate
total_epochs = training_parameters['num_epoch'] # Total number of epochs
div_factor = 1e2  # LR max / LR start
final_div_factor = 1e3  # LR start / LR end (PyTorch divides the initial rate, not the max)
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,  # the AdamW instance built above; training_parameters has no 'optimizer' key
    max_lr=max_lr,
    total_steps=total_epochs,
    pct_start=0.2,
    div_factor=div_factor,
    final_div_factor=final_div_factor
)