In [1]:
import time
import torch
import torch.nn as nn
import numpy as np
from model import GPT, GPTConfig, SentenceGenerator
from context_free_grammar import CFG
import wandb

In [1]:
# Authenticate with Weights & Biases before any run is created.
# NOTE(review): the recorded output of this cell is a NameError on `wandb`, and the
# cell carries the same execution count as the import cell — it was last executed in a
# kernel where the imports had not run. Re-run the notebook top to bottom to refresh it.
wandb.login()

NameError: name 'wandb' is not defined

In [3]:
# Grammar shared by every experiment in this notebook.
# NOTE(review): the meaning of L / ns / nr / T is defined in context_free_grammar.CFG
# and is not visible here — confirm there before changing any of these values.
cfg = CFG(L=3, ns=[1, 3, 9, 10], nr=[2, 2, 2], T=[8, 8, 8])
# Number of tokens in one flattened sentence (product of the entries of cfg.T);
# get_batch() reshapes every sampled sentence to this length.
sentence_length = np.prod(cfg.T)

In [4]:
# Small GPT: 6 layers, 6 heads, 384-dim embeddings; the vocabulary size is the last
# entry of cfg.ns. The model is wrapped in DataParallel before being moved to the
# configured device.
config = GPTConfig(
    vocab_size=cfg.ns[-1],
    n_embd=384,
    n_head=6,
    n_layer=6,
)
m = nn.DataParallel(GPT(config))
m.to(config.device)

number of parameters: 10.64M


DataParallel(
  (module): GPT(
    (transformer): ModuleDict(
      (wte): Embedding(10, 384)
      (wpe): Embedding(256, 384)
      (drop): Dropout(p=0.0, inplace=False)
      (h): ModuleList(
        (0-5): 6 x Block(
          (ln_1): LayerNorm()
          (attn): MultiHeadAttention(
            (heads): ModuleList(
              (0-5): 6 x Head(
                (key): Linear(in_features=384, out_features=64, bias=False)
                (query): Linear(in_features=384, out_features=64, bias=False)
                (value): Linear(in_features=384, out_features=64, bias=False)
                (dropout): Dropout(p=0.0, inplace=False)
              )
            )
            (proj): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ln_2): LayerNorm()
          (mlp): MLP(
            (c_fc): Linear(in_features=384, out_features=1536, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj)

In [5]:
 # print the number of parameters in the model
print(sum(map(torch.numel, m.parameters())) / 1e6, "M parameters")

10.742784 M parameters


In [6]:
# data loading = sample new sentences to fill-in the mini-batch
def get_batch(config: GPTConfig = GPTConfig()):
    """Sample a fresh mini-batch of sentences for next-token prediction.

    Draws `config.batch_size` sentences from the module-level grammar `cfg`,
    flattens each one to `sentence_length` tokens and builds the shifted
    input/target pair.

    Args:
        config: supplies `batch_size` and `device`. The default instance is
            created once, when this function is defined.

    Returns:
        (x, y): `x` holds every token but the last, `y` every token but the
        first; both have shape (N, sentence_length - 1) and live on
        `config.device`.
    """
    # The second value returned by cfg.sample is not needed here.
    sentences, _ = cfg.sample(config.batch_size)
    n_sentences = sentences.shape[0]  # should be equal to config.batch_size
    flat = sentences.view(n_sentences, sentence_length)
    inputs = flat[:, :sentence_length - 1]
    targets = flat[:, 1:].contiguous()
    return inputs.to(config.device), targets.to(config.device)

In [7]:
@torch.no_grad()
def estimate_loss(m, eval_iters):
    """Average the cross-entropy of `m` over freshly sampled batches.

    The model is switched to eval mode for the measurement and put back in
    train mode before returning.

    Args:
        m: model mapping a batch of token ids to logits.
        eval_iters: number of batches to sample with get_batch().

    Returns:
        dict with a single key "val" holding the mean loss as a 0-d tensor.
    """
    m.eval()
    batch_losses = torch.zeros(eval_iters)
    for step in range(eval_iters):
        inputs, targets = get_batch()
        logits = m(inputs)
        batch_losses[step] = nn.functional.cross_entropy(
            logits.view(-1, logits.size(-1)),
            targets.view(-1),
            ignore_index=-1,
        ).item()
    m.train()
    return {"val": batch_losses.mean()}

In [8]:
context_length = 3  # number of real tokens handed to the model as a prompt


@torch.no_grad()
def estimate_grammar_err_multipleGPU(m, n_gen=25, n_parallel=4):
    """Count how many model-completed sentences are error-free at each grammar level.

    For each of the `n_gen` rounds a real sentence is sampled from the grammar, its
    first `context_length` tokens are repeated `n_parallel` times to form a batch of
    prompts, and the model is asked to complete every prompt up to `sentence_length`
    tokens. Each completed sentence is checked with `cfg.collapse_and_get_err`.

    Args:
        m: DataParallel-wrapped model; the underlying `m.module` is what gets passed
            to SentenceGenerator.
        n_gen: number of generation rounds.
        n_parallel: number of identical prompts completed per round. Defaults to 4,
            the value that used to be hard-coded.

    Returns:
        np.ndarray of length `cfg.L`: for each level, the number of generated
        sentences with zero mistakes at that level.
    """
    m.eval()
    try:
        error_per_sentence = []
        for _ in range(n_gen):
            context = (
                cfg.sample_flattened(1)[0][0][:, :context_length]
                .expand(n_parallel, context_length)
                .to(config.device)
            )
            # Generate exactly the tokens missing after the prompt (this used to be a
            # literal 3 that silently had to match context_length).
            parallel_generator = nn.DataParallel(
                SentenceGenerator(
                    m.module,
                    context,
                    max_new_tokens=sentence_length - context_length,
                )
            )
            gen_sentences = parallel_generator()
            for sentence in gen_sentences:
                _, err = cfg.collapse_and_get_err(sentence.view(*cfg.T).cpu())
                mistakes = [torch.count_nonzero(level_errors).item() for level_errors in err]
                error_per_sentence.append(np.array(mistakes))
    finally:
        # Put the model back in training mode even if generation is interrupted.
        m.train()

    if not error_per_sentence:
        # Nothing was generated (e.g. n_gen == 0): no sentence is correct at any level.
        return np.zeros(cfg.L, dtype=np.int64)

    error_per_sentence = np.array(error_per_sentence)
    # Use the number of sentences actually checked rather than assuming 4 * n_gen.
    n_sentences = error_per_sentence.shape[0]
    # A sentence is correct at a level when its mistake count for that level is zero.
    return n_sentences - np.count_nonzero(error_per_sentence[:, :cfg.L], axis=0)

In [9]:
# Hyper-parameters of the 10M-parameter run. The dict is read by train() and is also
# passed to wandb.init() as the run config.
training_parameters = {
    'num_epoch': 1000,  # large number as we don't know yet what time it will take
    'batches_per_epoch': 100,
    'eval_iters': 500,
    # generation rounds per quality evaluation; train() reports 4 * 125 = 500 sentences
    'quality_metric_iters': 125,
    'learning_rate': 1e-5,
    'architecture': "GPT 10M",
    'grammar': cfg.__str__(),
    'batch_size': config.batch_size,
}
training_parameters['optimizer'] = torch.optim.AdamW(m.parameters(), lr=training_parameters['learning_rate'])

# Alternative step-wise warm-up (multiplier 1..10, growing by one every 5 epochs).
# Kept for reference only; the OneCycleLR schedule below is the one in use.
lambda_lr = lambda epoch: (epoch//5 + 1) if epoch < 50 else 10
#scheduler = torch.optim.lr_scheduler.LambdaLR(training_parameters['optimizer'], lr_lambda=[lambda_lr])

max_lr = 1e-3  # Maximum learning rate
min_lr = 1e-6  # Minimum learning rate (reached at the end of the schedule)
total_epochs = training_parameters['num_epoch']  # the scheduler is stepped once per epoch
div_factor = 1e2  # LR max / LR start
# OneCycleLR defines the final LR as initial_lr / final_div_factor, where
# initial_lr = max_lr / div_factor. Passing max_lr / min_lr (1e3) would therefore end
# at 1e-8, not at min_lr; derive the factor from the intended end value instead.
initial_lr = max_lr / div_factor
final_div_factor = initial_lr / min_lr  # LR start / LR end
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    training_parameters['optimizer'],
    max_lr=max_lr,
    total_steps=total_epochs,
    pct_start=0.2,
    div_factor=div_factor,
    final_div_factor=final_div_factor
)

In [10]:
# Training loop
def train(m):
    """Train `m` using the module-level `training_parameters` and `scheduler`.

    Every epoch runs `batches_per_epoch` optimisation steps on freshly sampled
    batches, evaluates the validation loss with estimate_loss(), logs to wandb and
    steps the scheduler once. Every 250 epochs the grammatical correctness of
    generated sentences is measured as well.

    Args:
        m: the model to optimise; its parameters must be the ones registered in
            `training_parameters['optimizer']`.

    Reads from `training_parameters`: 'num_epoch', 'batches_per_epoch',
    'eval_iters', 'quality_metric_iters' and 'optimizer'.
    """
    num_epoch = training_parameters['num_epoch']
    batches_per_epoch = training_parameters['batches_per_epoch']
    quality_iters = training_parameters['quality_metric_iters']
    optimizer = training_parameters['optimizer']

    print(f'One epoch is {batches_per_epoch} steps, ' +
          f'validation loss is computed at the end of every epoch and quality metric is ' +
          f'averaged over {4*quality_iters} sentences, computed every 250 epochs')
    print(f'Will run for {num_epoch} epochs')
    for epoch in range(num_epoch):
        for _ in range(batches_per_epoch):
            # sample a batch of data
            xb, yb = get_batch()
            # evaluate the loss
            logits = m(xb)
            optimizer.zero_grad(set_to_none=True)
            loss = nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), yb.view(-1), ignore_index=-1)
            loss.backward()
            optimizer.step()

        # evaluate the loss on newly generated sentences at the end of every epoch
        val_loss = estimate_loss(m, training_parameters["eval_iters"])['val']
        print(
            f'epoch {epoch}: val loss {val_loss:.4f}'
        )
        log_dict = {
            # Logged after the epoch has been trained, hence epoch + 1 completed epochs.
            "nb sentences seen": (epoch + 1) * batches_per_epoch * config.batch_size,
            "loss": val_loss,
            # Read before scheduler.step(): this is the LR used during this epoch.
            "learning_rate": optimizer.param_groups[0]["lr"],
        }
        if epoch % 250 == 0:
            errors = estimate_grammar_err_multipleGPU(m, quality_iters)
            print(
                f'epoch {epoch}: correct sentences for each level{errors}'
            )
            for i, err in enumerate(errors):
                log_dict[f'% of correct sentences at level {i}'] = err/(4*quality_iters) * 100
        wandb.log(log_dict)

        # ReduceLROnPlateau needs the monitored metric; other schedulers take no argument.
        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            scheduler.step(val_loss)
        else:
            scheduler.step()

In [None]:
# Run the 10M-parameter experiment. Using the run as a context manager closes it when
# training ends and also when it raises or is interrupted, so no run is left open.
with wandb.init(project='CFG-experiments', config=training_parameters, name='GPT 10M'):
    wandb.watch(m, log='all')
    train(m)

One epoch is 100 steps,validation loss is computed at the end of every epoch and quality metric isaveraged over {4*training_parameters["quality_metric_iters"]} sentences, computed every 250 epochs
Will run for 1000 epochs
epoch 0: val loss 2.1888
epoch 0: correct sentences for each level[1 0 0]
epoch 1: val loss 2.0254
epoch 2: val loss 1.9937
epoch 3: val loss 1.9699
epoch 4: val loss 1.9521
epoch 5: val loss 1.9265
epoch 6: val loss 1.8885
epoch 7: val loss 1.8483
epoch 8: val loss 1.8041
epoch 9: val loss 1.7249
epoch 10: val loss 1.6269
epoch 11: val loss 1.4246
epoch 12: val loss 1.1628
epoch 13: val loss 0.8711
epoch 14: val loss 0.6674
epoch 15: val loss 0.4737
epoch 16: val loss 0.3996
epoch 17: val loss 0.3551
epoch 18: val loss 0.3243
epoch 19: val loss 0.2791
epoch 20: val loss 0.2757
epoch 21: val loss 0.2459
epoch 22: val loss 0.2273
epoch 23: val loss 0.2249
epoch 24: val loss 0.2135
epoch 25: val loss 0.1913
epoch 26: val loss 0.1869
epoch 27: val loss 0.1873
epoch 28: v

# GPT 2 with 85M parameters

In [None]:
# Release cached GPU memory that PyTorch is holding but no longer using.
# NOTE(review): at this point `m` and training_parameters['optimizer'] still reference
# the first model, so their memory presumably stays allocated — confirm whether they
# should be deleted first.
torch.cuda.empty_cache()

In [None]:
# New experiment with larger model and same grammar
config = GPTConfig(vocab_size=cfg.ns[-1], n_embd=768, n_head=12, n_layer=12)
m_large = GPT(config)
m_large = nn.DataParallel(m_large)
m_large.to(config.device)

In [None]:
print(sum(map(torch.numel, m_large.parameters())) / 1e6, "M parameters")

In [None]:
# Hyper-parameters of the 85M-parameter run.
# train() reads 'num_epoch' and 'batches_per_epoch', which were missing here; they are
# derived from the iteration budget so that one epoch equals one evaluation interval.
max_iters = 15000
eval_interval = 500
training_parameters = {
    'max_iters': max_iters,
    'eval_interval': eval_interval,
    'num_epoch': max_iters // eval_interval,  # 30 epochs
    'batches_per_epoch': eval_interval,       # validation runs after every epoch
    'eval_iters': 50,
    'quality_metric_iters': 50,
    'learning_rate': 1e-4,
    'architecture': "GPT 85.04M",
    'grammar': cfg.__str__(),
    'batch_size': config.batch_size,
}
training_parameters['optimizer'] = torch.optim.AdamW(m_large.parameters(), lr=training_parameters['learning_rate'])
# Divide the LR by 10 when the monitored value has not improved for more than 2 steps.
# ReduceLROnPlateau.step() must be called with that monitored value (the validation loss).
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(training_parameters['optimizer'], mode='min', patience=2, factor=0.1)

In [None]:
# Run the 85M-parameter experiment. Using the run as a context manager closes it when
# training ends and also when it raises or is interrupted, so no run is left open.
with wandb.init(project='CFG-experiments', config=training_parameters):
    wandb.watch(m_large, log='all', log_freq=1)
    train(m_large)