In [6]:
import time
import torch
import torch.nn as nn
import numpy as np
from model import GPT, GPTConfig, SentenceGenerator
from context_free_grammar import CFG
import wandb

In [7]:
# Authenticate this kernel with Weights & Biases before any run is created.
wandb.login()

True

In [8]:
# Grammar used for every experiment in this notebook.
# `ns[-1]` is used below as the model's vocabulary size and `T` as the shape a
# flattened sentence is viewed back into when checking grammar errors.
# NOTE(review): the exact meaning of L / ns / nr / T is defined in
# context_free_grammar.CFG, which is not visible here; confirm there.
cfg = CFG(L=3, ns=[1, 3, 9, 10], nr=[2, 2, 2], T=[8, 8, 8])
# Number of tokens in one flattened sentence (product of the entries of T: 8*8*8 = 512).
sentence_length = np.prod(cfg.T)

In [9]:
# Small GPT: the vocabulary is the grammar's last-level symbol count (cfg.ns[-1]).
config = GPTConfig(
    vocab_size=cfg.ns[-1],
    n_embd=384,
    n_head=6,
    n_layer=6,
)
# Wrap the model in DataParallel right away; the raw GPT stays reachable as `m.module`.
m = nn.DataParallel(GPT(config))
# Kept as the last expression so the cell still displays the module tree.
m.to(config.device)

number of parameters: 10.64M


DataParallel(
  (module): GPT(
    (transformer): ModuleDict(
      (wte): Embedding(10, 384)
      (wpe): Embedding(256, 384)
      (drop): Dropout(p=0.0, inplace=False)
      (h): ModuleList(
        (0-5): 6 x Block(
          (ln_1): LayerNorm()
          (attn): MultiHeadAttention(
            (heads): ModuleList(
              (0-5): 6 x Head(
                (key): Linear(in_features=384, out_features=64, bias=False)
                (query): Linear(in_features=384, out_features=64, bias=False)
                (value): Linear(in_features=384, out_features=64, bias=False)
                (dropout): Dropout(p=0.0, inplace=False)
              )
            )
            (proj): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ln_2): LayerNorm()
          (mlp): MLP(
            (c_fc): Linear(in_features=384, out_features=1536, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj)

In [10]:
 # print the number of parameters in the model
# Sums the element count of every parameter tensor of `m` and reports it in millions.
# NOTE(review): this prints 10.74M while the GPT constructor printed 10.64M; the gap
# (~0.098M = 256*384) presumably corresponds to the position embedding being left
# out of the constructor's count. Confirm in model.py.
print(sum(p.numel() for p in m.parameters()) / 1e6, "M parameters")

10.742784 M parameters


In [11]:
# data loading: sample one new sentence and cut the mini-batch out of it
def get_batch(config: GPTConfig = GPTConfig()):
    """Build one (inputs, targets) mini-batch from a freshly sampled sentence.

    A single sentence is drawn from the module-level grammar `cfg` and flattened
    to `sentence_length` tokens. `config.batch_size` random windows of
    `config.block_size` tokens are then cut out of that one sentence.

    Args:
        config: supplies `block_size`, `batch_size` and `device`. The default is
            a `GPTConfig()` instance created once, when this function is defined.

    Returns:
        Tuple `(x, y)` of tensors with shape (batch_size, block_size), both on
        `config.device`. `y` is `x` shifted one token to the right
        (next-token targets).
    """
    window = config.block_size
    # presumably [0][0] selects the token tensor of the single sampled sentence;
    # verify against CFG.sample_flattened
    tokens = cfg.sample_flattened(1)[0][0].view(sentence_length)
    # the exclusive upper bound leaves room for the one-token shift of the targets
    starts = torch.randint(0, sentence_length - window, size=(config.batch_size,))
    inputs = torch.stack([tokens[s: s + window] for s in starts])
    targets = torch.stack([tokens[s + 1: s + window + 1] for s in starts])
    return inputs.to(config.device), targets.to(config.device)

In [12]:
@torch.no_grad()
def estimate_loss(m, eval_iters):
    """Average the cross-entropy of `m` over `eval_iters` freshly sampled batches.

    The model is put in eval mode for the measurement and switched back to
    train mode before returning. Gradients are disabled by the decorator.

    Args:
        m: model called as `m(x)`; its output is treated as logits whose last
            dimension is the vocabulary.
        eval_iters: number of batches drawn with `get_batch()`.

    Returns:
        dict with a single key "val" holding the mean loss as a 0-d tensor.
    """
    m.eval()
    batch_losses = torch.zeros(eval_iters)
    for step in range(eval_iters):
        inputs, targets = get_batch()
        scores = m(inputs)
        # flatten (batch, time, vocab) logits and (batch, time) targets for cross_entropy
        flat_scores = scores.view(-1, scores.size(-1))
        flat_targets = targets.view(-1)
        batch_losses[step] = nn.functional.cross_entropy(
            flat_scores, flat_targets, ignore_index=-1
        ).item()
    m.train()
    return {"val": batch_losses.mean()}

In [13]:
context_length = 3
@torch.no_grad()
def estimate_grammar_err_multipleGPU(m, n_gen=25):
    """Count how many model-completed sentences are error-free at each grammar level.

    For each of the `n_gen` rounds a real sentence is sampled from the grammar,
    its first `context_length` tokens are expanded to 4 identical rows, and a
    `SentenceGenerator` wrapped in `nn.DataParallel` completes them to
    `sentence_length` tokens. Each generated sentence is checked with
    `cfg.collapse_and_get_err`.

    Relies on the module-level `cfg`, `config`, `sentence_length` and
    `context_length`. The model is put in eval mode and switched back to train
    mode before returning.

    Args:
        m: DataParallel-wrapped model; the raw model is read from `m.module`.
        n_gen: number of generation rounds (4 rows per round, so 4*n_gen
            sentences are expected in total).

    Returns:
        np.ndarray of length `cfg.L`; entry `l` is the number of generated
        sentences with zero mistakes at level `l`.
    """
    n_copies = 4  # rows the context is expanded to in each round
    # derive the completion length from context_length instead of a second hard-coded 3
    new_tokens = sentence_length - context_length
    m.eval()
    error_per_sentence = []
    for _ in range(n_gen):
        context = cfg.sample_flattened(1)[0][0][:, :context_length].expand(n_copies, context_length).to(config.device)
        # NOTE(review): how the rows are spread over GPUs depends on SentenceGenerator
        # and on DataParallel being called without inputs; confirm in model.py.
        parallel_generator = nn.DataParallel(SentenceGenerator(m.module, context, max_new_tokens=new_tokens))
        gen_sentences = parallel_generator()
        for sentence in gen_sentences:
            _, err = cfg.collapse_and_get_err(sentence.view(*cfg.T).cpu())
            # one mistake count per grammar level for this sentence
            error_per_sentence.append([int(torch.count_nonzero(level_errors)) for level_errors in err])
    m.train()

    n_total = len(error_per_sentence)
    if n_total == 0:
        # nothing was generated (e.g. n_gen == 0): no sentence can be correct
        return np.zeros(cfg.L, dtype=int)
    error_per_sentence = np.array(error_per_sentence)
    # a sentence is correct at level l when its mistake count there is zero;
    # count against the number of sentences actually generated
    res = [n_total - np.count_nonzero(error_per_sentence[:, l]) for l in range(cfg.L)]
    return np.array(res)

In [14]:
context_length = 3
@torch.no_grad()
def estimate_grammar_err(m, n_gen=100):
    """Count how many model-completed sentences are error-free at each grammar level.

    Single-sentence variant: for each of the `n_gen` iterations a real sentence
    is sampled from the grammar, and the model is given its first
    `context_length` tokens to complete to `sentence_length` tokens. The result
    is checked with `cfg.collapse_and_get_err`.

    Relies on the module-level `cfg`, `config`, `sentence_length` and
    `context_length`. The model is put in eval mode and switched back to train
    mode before returning.

    Args:
        m: DataParallel-wrapped model; generation uses `m.module.generate`.
        n_gen: number of sentences to generate (one per iteration).

    Returns:
        np.ndarray of length `cfg.L`; entry `l` is the number of generated
        sentences (out of `n_gen`) with zero mistakes at level `l`.
    """
    m.eval()
    generator = m.module  # generate() lives on the wrapped model, not on DataParallel
    new_tokens = sentence_length - context_length
    error_per_sentence = []
    for _ in range(n_gen):
        context = cfg.sample_flattened(1)[0][0][:, :context_length].to(config.device)
        # batch of one context; [0] takes the single generated sentence
        gen_sentence = generator.generate(context.reshape(1, context_length), max_new_tokens=new_tokens)[0]
        _, err = cfg.collapse_and_get_err(gen_sentence.view(*cfg.T).cpu())
        # one mistake count per grammar level for this sentence
        error_per_sentence.append([int(torch.count_nonzero(level_errors)) for level_errors in err])
    m.train()

    n_total = len(error_per_sentence)
    if n_total == 0:
        # nothing was generated (n_gen == 0): no sentence can be correct
        return np.zeros(cfg.L, dtype=int)
    error_per_sentence = np.array(error_per_sentence)
    # Exactly one sentence is produced per iteration, so the total is n_gen.
    # The previous `n_gen*4` was copied from the 4-row variant and over-counted
    # the correct sentences by 3*n_gen.
    res = [n_total - np.count_nonzero(error_per_sentence[:, l]) for l in range(cfg.L)]
    return np.array(res)

In [20]:
# Hyper-parameters of the 10M run. The dict is read by train() and is also passed
# to wandb.init as the run config.
training_parameters = {'num_epoch': 500, # large number as we don't know yet what time it will take
                       'batches_per_epoch' : 40,
                       'eval_iters' : 500,
                       'quality_metric_iters' : 125, # 4 sentences per iteration -> 500 sentences generated at each val step
                       'learning_rate' : 1e-5,
                       'architecture':"GPT 10M",
                       'grammar': str(cfg),
                       'batch_size':config.batch_size,}
training_parameters['optimizer'] = torch.optim.AdamW(m.parameters(), lr=training_parameters['learning_rate'])

# Alternative warm-up (currently unused, kept for reference):
# start at 1e-5 and increase by 1e-5 every 5 epochs until 1e-4 is reached
lambda_lr = lambda epoch: (epoch//5 + 1) if epoch < 50 else 10
#scheduler = torch.optim.lr_scheduler.LambdaLR(training_parameters['optimizer'], lr_lambda=[lambda_lr])

max_lr = 1e-3  # Maximum learning rate
min_lr = 1e-6  # Minimum learning rate
total_epochs = training_parameters['num_epoch'] # Total number of epochs
div_factor = 1e2  # LR max / LR start  -> initial lr = max_lr / div_factor = 1e-5
# OneCycleLR computes the final lr as initial_lr / final_div_factor (NOT
# max_lr / final_div_factor). The previous value of 1e3 therefore ended the
# schedule at 1e-8; derive the factor from min_lr so the schedule really ends there.
final_div_factor = (max_lr / div_factor) / min_lr  # LR start / LR end
# The scheduler is stepped once per epoch by train(), hence total_steps = number of epochs.
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    training_parameters['optimizer'],
    max_lr=max_lr,
    total_steps=total_epochs,
    pct_start=0.2,
    div_factor=div_factor,
    final_div_factor=final_div_factor
)

In [21]:
# Training loop
def train(m):
    """Train `m` using the module-level `training_parameters` and `scheduler`.

    Each epoch runs `batches_per_epoch` optimisation steps on batches from
    `get_batch()`. Every `eval_every` epochs (default 50, overridable with an
    optional 'eval_every' key in `training_parameters`) the validation loss and
    the per-level count of correct generated sentences are computed, printed and
    logged to wandb. The scheduler is stepped once per epoch.

    Reads from `training_parameters`: 'num_epoch', 'batches_per_epoch',
    'eval_iters', 'quality_metric_iters', 'optimizer'. Also uses the globals
    `scheduler` and `config`. An active wandb run is expected.

    Args:
        m: DataParallel-wrapped model to optimise in place.
    """
    params = training_parameters
    optimizer = params['optimizer']
    batches_per_epoch = params['batches_per_epoch']
    eval_every = params.get('eval_every', 50)
    # estimate_grammar_err_multipleGPU produces 4 sentences per iteration
    n_eval_sentences = 4 * params['quality_metric_iters']

    # the banner used to claim validation ran every epoch; report the real interval
    print(
        f'One epoch is {batches_per_epoch} steps, '
        f'validation is run every {eval_every} epochs '
        f'and metrics are averaged over {n_eval_sentences} sentences'
    )
    print(f'Will run for {params["num_epoch"]} epochs')
    for epoch in range(params['num_epoch']):
        for _ in range(batches_per_epoch):
            # sample a batch of data
            xb, yb = get_batch()
            # evaluate the loss
            logits = m(xb)
            loss = nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), yb.view(-1), ignore_index=-1)
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()
        # periodically evaluate the loss on newly generated sentences
        if epoch % eval_every == 0:
            val_loss = estimate_loss(m, params["eval_iters"])['val']
            print(
                f'epoch {epoch}: val loss {val_loss:.4f}'
            )

            errors = estimate_grammar_err_multipleGPU(m, params['quality_metric_iters'])
            print(
                f'epoch {epoch}: correct sentences for each level{errors}'
            )
            # Evaluation runs after this epoch's batches, so (epoch + 1) epochs of
            # data have been consumed (the previous `epoch * ...` logged 0 at the first eval).
            # NOTE(review): get_batch() draws every window of a batch from a single
            # sampled sentence and uses its own default GPTConfig(), so this figure
            # counts training windows under `config.batch_size`; confirm it is the
            # quantity you want on the x-axis.
            log_dict = {"nb sentences seen": (epoch + 1) * batches_per_epoch * config.batch_size,
                        "loss": val_loss,
                        "learning_rate": optimizer.param_groups[0]["lr"]}
            for i, err in enumerate(errors):
                log_dict[f'% of correct sentences at level {i}'] = err / n_eval_sentences * 100
            wandb.log(log_dict)
        scheduler.step()

In [None]:
# Run the 10M experiment inside a wandb run.
wandb.init(project='CFG-experiments',config=training_parameters, name='GPT 10M')
try:
    wandb.watch(m, log='all')
    train(m)
except BaseException:
    # Close the run even when training raises or the cell is interrupted, and
    # mark it as failed so it is not left open or reported as a success.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

VBox(children=(Label(value='0.008 MB of 0.008 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
% of correct sentences at level 0,▁▁▁▁▁█▁▁
% of correct sentences at level 1,▁▁▁▁▁▁▁▁
% of correct sentences at level 2,▁▁▁▁▁▁▁▁
learning_rate,▁▁▂▂▃▅▆█
loss,█▇▄▃▃▂▂▁
nb sentences seen,▁▂▃▄▅▆▇█

0,1
% of correct sentences at level 0,0.0
% of correct sentences at level 1,0.0
% of correct sentences at level 2,0.0
learning_rate,6e-05
loss,1.88547
nb sentences seen,71680.0


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111361775547266, max=1.0)…

One epoch is 40 steps,    validation is run at the end of every epoch and metrics are averaged over 125 sentences
Will run for 500 epochs
epoch 0: val loss 1.7920


# GPT 2 with 85M parameters

In [None]:
# Release cached, currently unused GPU memory before building the larger model.
# NOTE(review): the 10M model `m` and its optimizer (held in `training_parameters`)
# are still referenced at this point, so their memory is not freed by this call.
torch.cuda.empty_cache()

In [None]:
# New experiment: larger model, same grammar.
# NOTE: this rebinds the module-level `config` that the evaluation helpers read.
config = GPTConfig(
    vocab_size=cfg.ns[-1],
    n_embd=768,
    n_head=12,
    n_layer=12,
)
# Wrap the model in DataParallel right away; the raw GPT stays reachable as `m_large.module`.
m_large = nn.DataParallel(GPT(config))
# Kept as the last expression so the cell still displays the module tree.
m_large.to(config.device)

In [None]:
# Total number of elements over all parameter tensors of the large model, in millions.
print(sum(p.numel() for p in m_large.parameters()) / 1e6, "M parameters")

In [None]:
# Hyper-parameters for the large-model run; rebinds the module-level
# `training_parameters` and `scheduler` that train() reads.
# NOTE(review): train() looks up 'num_epoch' and 'batches_per_epoch', which this
# dict does not define ('max_iters' / 'eval_interval' are not read by train()),
# so train(m_large) would raise a KeyError as written.
training_parameters = {'max_iters' : 15000,
                       'eval_interval' : 500,
                       'eval_iters' : 50,
                       'quality_metric_iters' : 50,
                       'learning_rate' : 1e-4,
                       'architecture':"GPT 85.04M",
                       'grammar': cfg.__str__(),
                       'batch_size':config.batch_size,}
training_parameters['optimizer'] = torch.optim.AdamW(m_large.parameters(), lr=training_parameters['learning_rate'])
# NOTE(review): ReduceLROnPlateau.step expects the monitored metric (e.g. the
# validation loss); make sure the training loop passes one when stepping this scheduler.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(training_parameters['optimizer'], mode='min', patience=2, factor=0.1) # Divide lr by 10

In [None]:
# Run the large-model experiment inside a wandb run.
wandb.init(project='CFG-experiments',config=training_parameters)
try:
    wandb.watch(m_large, log='all', log_freq=1)
    train(m_large)
except BaseException:
    # Close the run even when training raises or the cell is interrupted, and
    # mark it as failed so it is not left open or reported as a success.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()