In [2]:
%load_ext autoreload
%autoreload 2

import os
import sys

import torch
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import yaml

# Put the local ./utils directory (or wherever your source code lives) at the
# front of the import search path so modules inside it resolve first.
module_path = os.path.abspath('./utils')
sys.path.insert(0, module_path)

from utils.exp_util import setup_exp_folders, get_pretrained_name, init_model
from utils.data_util import get_datasets

In [2]:
# Use the first CUDA GPU when PyTorch can see one, otherwise run on the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
print(f"Device: {device}")

Device: cuda:0


## Config

In [3]:
# Name of the experiment; it selects the YAML file read below and is also
# used by train() when it sets up the experiment folders.
EXPERIMENT_NAME = "small_no_context"

# Hyper-parameters live in ./config/<EXPERIMENT_NAME>.yaml; safe_load parses
# plain YAML only (no arbitrary object construction).
with open(f"./config/{EXPERIMENT_NAME}.yaml", 'r') as config_file:
    config = yaml.safe_load(config_file)
print(config)

{'GPT_SIZE': 'small', 'TRAIN_SPLIT': 0.75, 'VAL_SPLIT': 0.15, 'TEST_SPLIT': 0.1, 'EPOCHS': 1, 'BATCH_SIZE': 8, 'LR': 5e-05, 'WARMUP_STEP': 100, 'GRADIENT_ACCUMULATION_STEPS': 32, 'MAX_GRAD_NORM': 1, 'RANDOM_SEED': 42}


## Load Dataset

In [4]:
from transformers import GPT2Tokenizer

# Special tokens registered on top of the pretrained vocabulary.
special_tokens_dict = {
    'pad_token': '<|pad|>',
    'bos_token': '<|startoftext|>',
    'eos_token': '<|endoftext|>',
}

# Resolve the configured size to a pretrained checkpoint name and load its tokenizer.
gpt_type = get_pretrained_name(config['GPT_SIZE'])
tokenizer = GPT2Tokenizer.from_pretrained(gpt_type)

# Left as the cell's last expression so the call's return value is displayed
# (presumably the number of tokens newly added to the vocabulary — TODO confirm).
tokenizer.add_special_tokens(special_tokens_dict)

2

In [5]:
# Build the train / validation / test datasets from the config and tokenizer.
(
    train_dataset,
    val_dataset,
    test_dataset,
) = get_datasets(config, tokenizer)

# Report split sizes as a quick sanity check.
print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")

# Show the first rows of the training split's underlying DataFrame.
train_dataset.df.head()

10000 articles loaded.
9022 samples after cleaning
Train: 6766, Val: 1354, Test: 902


Unnamed: 0,title,description,title_tokens,description_tokens
2173,Kuehne + Nagel International : Analyst year-en...,(marketscreener.com) Consolidated Financial St...,"[K, ue, h, ne, Ġ+, ĠNag, el, ĠInternational, Ġ...","[(, markets, cre, ener, ., com, ), ĠConsolid, ..."
7134,PDT Partners LLC Cuts Stock Position in ABM In...,PDT Partners LLC Cuts Stock Position in ABM In...,"[PD, T, ĠPartners, ĠLLC, ĠC, uts, ĠStock, ĠPos...","[PD, T, ĠPartners, ĠLLC, ĠC, uts, ĠStock, ĠPos..."
7292,PDRs as an instrument for capital raising,Raising capital and expanding foreign investme...,"[PD, Rs, Ġas, Ġan, Ġinstrument, Ġfor, Ġcapital...","[Ra, ising, Ġcapital, Ġand, Ġexpanding, Ġforei..."
573,"Paul Vallas, former schools CEO backed by poli...","Paul Vallas, former schools CEO backed by poli...","[Paul, ĠV, allas, ,, Ġformer, Ġschools, ĠCEO, ...","[Paul, ĠV, allas, ,, Ġformer, Ġschools, ĠCEO, ..."
7430,Abercrombie & Fitch Co. Reports Fourth Quarter...,(marketscreener.com) Delivers fourth quarter n...,"[A, ber, c, rom, bie, Ġ&, ĠF, itch, ĠCo, ., ĠR...","[(, markets, cre, ener, ., com, ), ĠDel, ivers..."


In [6]:
# Length of the longest `description_tokens` entry in the training split.
max(len(tokens) for tokens in train_dataset.df["description_tokens"])

157

## Train

In [10]:
from torch.utils.tensorboard import SummaryWriter


In [11]:
def _save_checkpoint(model_path, epoch, global_step, model, optimizer, scheduler, loss, tr_loss):
    """Write model/optimizer/scheduler state and training counters to `model_path`."""
    torch.save({
            'epoch': epoch,
            'global_step': global_step,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            # Extra key (older readers can ignore it) so a resumed run can
            # restore the learning-rate schedule as well.
            'scheduler_state_dict': scheduler.state_dict(),
            # Detached so the checkpoint does not carry autograd state; None
            # when no batch was ever processed.
            'loss': loss.detach() if loss is not None else None,
            'tr_loss': tr_loss
            }, model_path)


def train(model, train_dataset, valid_dataset, config, checkpoint_every=0):
    """Fine-tune `model` on `train_dataset` using gradient accumulation.

    Logs the learning rate and training loss to TensorBoard after every
    optimizer update, evaluates on `valid_dataset` every 10 accumulation
    windows, and writes checkpoints into the experiment's model directory.

    Args:
        model: Model called as ``model(input_ids, labels=..., attention_mask=...,
            token_type_ids=None)`` whose first output is the loss.
        train_dataset: Dataset whose batches carry the input ids at index 0 and
            the attention mask at index 1.
        valid_dataset: Dataset passed to ``evaluate`` for periodic validation.
        config: Mapping providing ``BATCH_SIZE``, ``EPOCHS``,
            ``GRADIENT_ACCUMULATION_STEPS``, ``LR``, ``WARMUP_STEP`` and
            ``MAX_GRAD_NORM``.
        checkpoint_every: Save a checkpoint every this many epochs. ``0`` (the
            default), ``None`` or a negative value disables periodic
            checkpoints; a final checkpoint is always written.

    Returns:
        None. Results are written to the TensorBoard log and checkpoint files.
    """
    log_dir, model_dir = setup_exp_folders(EXPERIMENT_NAME)
    writer = SummaryWriter(log_dir)

    train_dataloader = DataLoader(train_dataset, batch_size=config["BATCH_SIZE"])
    EPOCHS = config["EPOCHS"]
    GRADIENT_ACCUMULATION_STEPS = config["GRADIENT_ACCUMULATION_STEPS"]
    batches_per_epoch = len(train_dataloader)

    # Move batches to wherever the model's weights live rather than relying on
    # a notebook-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # NOTE(review): transformers' AdamW is deprecated upstream in favour of
    # torch.optim.AdamW; the defaults differ (weight_decay, eps), so pin them
    # explicitly when migrating.
    optimizer = AdamW(model.parameters(), lr=config["LR"])

    # The scheduler advances once per optimizer update, not once per
    # micro-batch, so its horizon is the number of updates (ceil division to
    # include the trailing partial window of each epoch).
    updates_per_epoch = -(-batches_per_epoch // GRADIENT_ACCUMULATION_STEPS)
    total_steps = updates_per_epoch * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=config["WARMUP_STEP"], num_training_steps=total_steps
    )

    periodic_checkpoints = bool(checkpoint_every) and checkpoint_every > 0

    model.train()
    global_step = 0
    tr_loss = 0.0
    window_loss, window_batches = 0.0, 0
    epoch = 0            # stays 0 if EPOCHS < 1
    loss = None          # stays None if no batch is processed
    last_saved_epoch = None
    model.zero_grad()

    try:
        for epoch in range(1, EPOCHS + 1):
            print(f"Epoch {epoch}")
            for step, batch in enumerate(tqdm(train_dataloader)):
                b_input_ids = batch[0].to(target_device)
                # The labels are the inputs themselves.
                b_labels = b_input_ids
                b_masks = batch[1].to(target_device)

                outputs = model(
                    b_input_ids,
                    labels=b_labels,
                    attention_mask=b_masks,
                    token_type_ids=None,
                )

                # Scale so the accumulated gradient is the window average.
                loss = outputs[0] / GRADIENT_ACCUMULATION_STEPS
                loss.backward()

                scaled_loss = loss.item()
                tr_loss += scaled_loss
                window_loss += scaled_loss
                window_batches += 1

                window_complete = (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0
                last_batch = (step + 1) == batches_per_epoch
                # Also update on the last batch of the epoch so leftover
                # gradients are applied instead of leaking into the next epoch.
                if window_complete or last_batch:
                    # Clip once, on the fully accumulated gradient.
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config["MAX_GRAD_NORM"])
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1
                    writer.add_scalar('lr', scheduler.get_last_lr()[0], global_step)
                    # Undo the accumulation scaling: log the mean per-batch loss.
                    mean_window_loss = window_loss * GRADIENT_ACCUMULATION_STEPS / window_batches
                    writer.add_scalar('loss', mean_window_loss, global_step)
                    window_loss, window_batches = 0.0, 0

                if (step + 1) % (10 * GRADIENT_ACCUMULATION_STEPS) == 0:
                    results = evaluate(model, valid_dataset, config["BATCH_SIZE"])
                    for key, value in results.items():
                        writer.add_scalar('eval_{}'.format(key), value, global_step)
                    # Back to training mode after validation.
                    model.train()

            if periodic_checkpoints and epoch % checkpoint_every == 0:
                model_path = os.path.join(model_dir, f"checkpoint_epoch{epoch}.pt")
                _save_checkpoint(model_path, epoch, global_step, model, optimizer,
                                 scheduler, loss, tr_loss)
                last_saved_epoch = epoch
                print(f"Saved checkpoint to {model_path}\n")

        if last_saved_epoch != epoch:
            # Save final checkpoint (if it wasn't already saved)
            model_path = os.path.join(model_dir, f"checkpoint_epoch{epoch}.pt")
            _save_checkpoint(model_path, epoch, global_step, model, optimizer,
                             scheduler, loss, tr_loss)
            print(f"Saved final checkpoint to {model_path}\n")
    finally:
        # Flush pending TensorBoard events even if training is interrupted.
        writer.close()

    print("Training complete!")


In [12]:
def evaluate(model, eval_dataset, batch_size):
    """Compute the mean loss and perplexity of `model` on `eval_dataset`.

    The model is switched to eval mode and is left in eval mode on return;
    callers that continue training must call ``model.train()`` again.

    Args:
        model: Model called as ``model(input_ids, labels=..., attention_mask=...,
            token_type_ids=None)`` whose first output is the loss.
        eval_dataset: Dataset whose batches carry the input ids at index 0 and
            the attention mask at index 1.
        batch_size: Number of samples per evaluation batch.

    Returns:
        dict with:
            "loss": float, the mean of the per-batch losses.
            "perplexity": 0-dim tensor, ``exp`` of that mean loss.
        Both are NaN when the dataset yields no batches.
    """
    eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size)

    # Move batches to wherever the model's weights live rather than relying on
    # a notebook-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in eval_dataloader:
            b_input_ids = batch[0].to(target_device)
            # The labels are the inputs themselves.
            b_labels = b_input_ids
            b_masks = batch[1].to(target_device)

            outputs = model(
                b_input_ids,
                labels=b_labels,
                attention_mask=b_masks,
                token_type_ids=None,
            )

            eval_loss += outputs[0].item()
            nb_eval_steps += 1

    # Guard against an empty dataset instead of dividing by zero.
    if nb_eval_steps == 0:
        eval_loss = float("nan")
    else:
        eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    return {"perplexity": perplexity, "loss": eval_loss}


In [13]:
# Build the model for the configured GPT size using the extended tokenizer.
gpt_size = config["GPT_SIZE"]
model = init_model(tokenizer, gpt_size)

# Move the weights to the selected device; as the cell's last expression this
# also displays the module structure.
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50259, bias=False)
)

In [None]:
# Launch fine-tuning; a periodic checkpoint is requested every 5 epochs.
train(
    model,
    train_dataset,
    valid_dataset=val_dataset,
    config=config,
    checkpoint_every=5,
)