In [None]:
import glob
import json
import os
import random

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm, trange
from sklearn.metrics import f1_score, accuracy_score

import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset
from transformers import WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification

# Training Classifier on Labelled Data

## Parameters

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA (a hard-coded 'cuda' device makes MODEL.to(device) fail there).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Location handed to every `from_pretrained` call below (config, tokenizer, model).
MODEL_NAME = "./classification-outputs/"

# Input files: one example per line in the data files, one integer label per
# line in the label files. They must be filled in before the dataset cell runs,
# because SemEvalDataset asserts that each path is an existing file.
TRAIN_DATA_FILE = ""
TRAIN_LABEL_FILE = ""
VAL_DATA_FILE = ""
VAL_LABEL_FILE = ""

# Directory that receives checkpoints and the final model (see TRAIN_PARAMS['checkpoint_dir']).
OUTPUT_DIR = "./classification-outputs"

# Token length every example is encoded to (default `block_size` of SemEvalDataset).
MAX_LENGTH = 64
# Seed passed to set_seed().
RANDOM_SEED = 100

In [None]:
# Hyper-parameters and bookkeeping options read by `train` (its default `params`).
TRAIN_PARAMS = {
    'batch_size': 8,            # examples per DataLoader batch (also used for evaluation)
    'learning_rate': 1e-5,      # AdamW learning rate
    'weight_decay': 1e-5,       # applied to all parameters except biases and LayerNorm weights
    'adam_epsilon': 1e-8,       # AdamW epsilon
    'max_grad_norm': 1.0,       # gradient-norm clipping threshold
    'grad_accum_steps': 1,      # batches accumulated per optimizer update
    'warmup_steps': 500,        # warmup steps of the linear schedule
    'checkpoint_steps': 500,    # save a checkpoint every N optimizer updates (<= 0 disables)
    'checkpoint_dir': OUTPUT_DIR,
    'eval_steps': 250,          # evaluate / report loss every N optimizer updates (<= 0 disables)
    'num_train_epochs': 1,
    'max_steps': -1, # if >0, overrides num_train_epochs
    # Resume source; only set when MODEL_NAME ends with '.pt'.
    # NOTE(review): `train` joins this value with 'optimizer.pt' / 'scheduler.pt' as if it
    # were a directory -- confirm which form is intended.
    'checkpoint': MODEL_NAME if MODEL_NAME.endswith('.pt') else None
}

## Load Model

In [None]:
# Config, tokenizer and model are all loaded from the same MODEL_NAME location.
CONFIG = BertConfig.from_pretrained(MODEL_NAME)
CONFIG.num_labels = 3  # the classifier predicts one of three labels

TOKENIZER = BertTokenizer.from_pretrained(
    MODEL_NAME,
    do_lower_case = False,
    config = CONFIG,
)
MODEL = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    config = CONFIG,
)

#MODEL.eval()

In [None]:
# Move the model onto the configured device; as the last expression of the
# cell, the returned module is also displayed by the notebook.
MODEL.to(device)

In [None]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    Only parameters whose `requires_grad` flag is set are counted.
    """
    trainable_total = 0
    for weight in model.parameters():
        if weight.requires_grad:
            trainable_total += weight.numel()
    return trainable_total
# Report how many trainable parameters the loaded model has.
print(f"number of parameters in the model={count_parameters(MODEL)}")

## Methods

In [None]:
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        seed: Integer seed forwarded unchanged to each library.
    """
    # Same order as always: stdlib first, then NumPy, then PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

### Dataset

In [None]:
class SemEvalDataset(Dataset):
    """Line-aligned text/label dataset, tokenized once at construction time.

    `source_path` holds one example per line and `target_path` one integer
    label per line; both files must have the same number of lines. Every
    example is encoded to `block_size` token ids and paired with a mask that
    is 1 wherever the token id is greater than 0 (this treats id 0 as padding
    -- TODO confirm against the tokenizer's pad id).
    """

    def __init__(self, source_path: str, target_path: str, tokenizer = TOKENIZER, block_size = MAX_LENGTH):
        """Read both files, validate them and pre-compute ids, masks and labels.

        Args:
            source_path: Path of the text file (one example per line).
            target_path: Path of the label file (one integer per line).
            tokenizer: Object whose `encode` method turns a line into token ids.
            block_size: Value passed to `encode` as `max_length`.

        Raises:
            AssertionError: If either path is not a file or the line counts differ.
            ValueError: If a label line cannot be parsed as an integer.
        """
        assert os.path.isfile(source_path)
        assert os.path.isfile(target_path)
        print(f"Creating features from source dataset file at {source_path}")
        print(f"Creating features from target label file at {target_path}")

        with open(source_path) as text_file:
            source = [raw_line.strip() for raw_line in text_file]
        with open(target_path) as label_file:
            target = [raw_line.strip() for raw_line in label_file]
        assert len(source) == len(target)

        self.ids = []
        self.masks = []
        self.labels = list(map(int, target))

        for text in tqdm(source, leave = False):
            token_ids = tokenizer.encode(text, max_length = block_size, pad_to_max_length = True)
            attention = [1 if token_id > 0 else 0 for token_id in token_ids]
            self.ids.append(token_ids)
            self.masks.append(attention)

    def __len__(self):
        """Number of encoded examples."""
        return len(self.ids)

    def __getitem__(self, i):
        """Return `(token ids, mask, label)` for example `i`, each as a tensor."""
        fields = (self.ids, self.masks, self.labels)
        return tuple(torch.tensor(field[i]) for field in fields)

### Evaluation Function

In [None]:
def evaluate(dataset, model, tokenizer, batch_size, key, max_steps = None, device = torch.device('cpu')):
    """Compute the classification accuracy of `model` over `dataset`.

    Batches are read in dataset order. The metric is computed once, after the
    loop has consumed every batch (or `max_steps` batches when that cap is set),
    so it covers all evaluated examples rather than only the first batch.

    Args:
        dataset: Dataset whose items unpack to `(input_ids, attention_mask, label)` tensors.
        model: Called with `input_ids`, `attention_mask` and `labels`; its first
            two outputs are taken as `(loss, logits)`.
        tokenizer: Unused; kept so existing call sites keep working.
        batch_size: Number of examples per evaluation batch.
        key: Name of the split, used only in the progress messages.
        max_steps: Optional cap on the number of batches to evaluate.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The accuracy as a float, or `nan` when no batch was evaluated.
    """
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset, sampler = sampler, batch_size = batch_size)

    print(f"Beginning evaluation on dataset {key}")

    eval_loss = 0.0
    num_steps = 0
    preds = []
    targets = []

    # Evaluation mode is loop-invariant, so set it once up front.
    model.eval()
    iterator = tqdm(dataloader, desc='evaluating', leave = False, total = max_steps)

    for batch in iterator:
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        with torch.no_grad():
            # Labels are passed with their batch shape, the same way `train` passes them.
            outputs = model(input_ids = input_ids, attention_mask = attention_mask, labels = labels)
        loss, logits = outputs[:2]
        eval_loss += loss.mean().item()
        num_steps += 1

        preds.extend(logits.argmax(dim = -1).cpu().tolist())
        targets.extend(labels.cpu().tolist())

        if max_steps and num_steps >= max_steps:
            iterator.close()
            break

    if num_steps == 0:
        # Nothing was evaluated (empty dataset): there is no meaningful accuracy.
        print(f"no batches to evaluate for dataset {key}")
        return float('nan')

    eval_loss = eval_loss / num_steps

    y_true, y_pred = np.array(targets), np.array(preds)
    accuracy = accuracy_score(y_true, y_pred)

    print(f'loss:{eval_loss}')
    print(f'accuracy:{accuracy}')

    return accuracy

### Training Function

In [None]:
def train(dataset, model = MODEL, tokenizer = TOKENIZER, params = TRAIN_PARAMS, val_dataset = None, device = torch.device('cpu')):
    """Fine-tune `model` on `dataset` with AdamW and a linear warmup/decay schedule.

    Batches are drawn in random order. Gradients are accumulated over
    `params['grad_accum_steps']` batches, clipped to `params['max_grad_norm']`,
    and then applied as one optimizer update. Checkpoints, periodic evaluation
    and the final model save are driven by the remaining keys of `params`.

    Args:
        dataset: Training dataset; items unpack to `(input_ids, attention_mask, label)` tensors.
        model: Model to train; must already live on `device`.
        tokenizer: Used to resize the token embeddings and forwarded to `evaluate`.
        params: Dict with the keys defined in `TRAIN_PARAMS`.
        val_dataset: Optional dataset; when given, validation accuracy and a capped
            training accuracy are reported every `params['eval_steps']` updates.
        device: Device each batch is moved to.

    Returns:
        `(global_step, average_loss)`: the number of optimizer updates performed
        (including those inferred from a resumed checkpoint name) and the summed
        training loss divided by that count (0.0 if no update was performed).
    """
    sampler = RandomSampler(dataset)
    dataloader = DataLoader(dataset, sampler = sampler, batch_size = params['batch_size'])

    grad_accum_steps = params['grad_accum_steps']
    # Optimizer updates per pass over the data; clamped to 1 so the divisions
    # and modulo below are always defined.
    updates_per_epoch = max(1, len(dataloader) // grad_accum_steps)

    if params['max_steps'] > 0:
        total_steps = params['max_steps']
        # Epochs needed to reach max_steps *updates* (not batches).
        num_epochs = params['max_steps'] // updates_per_epoch + 1
    else:
        total_steps = len(dataloader) // grad_accum_steps * params['num_train_epochs']
        num_epochs = params['num_train_epochs']

    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_params = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': params['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_params, lr = params['learning_rate'], eps = params['adam_epsilon'])
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = params['warmup_steps'], num_training_steps = total_steps)

    print('training...')
    print(f"num examples = \t\t\t{len(dataset)}")
    print(f"num epochs = \t\t\t{num_epochs}")

    if grad_accum_steps > 1:
        print(f"gradient accumulation steps = \t{grad_accum_steps}")
        print(f"batch size with accumulation = \t{params['batch_size'] * grad_accum_steps}")
    else:
        print(f"batch size = \t\t\t{params['batch_size']}")
    print(f"total optimization steps = \t{total_steps}")

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0

    if params['checkpoint']:
        opt_path = os.path.join(params['checkpoint'], 'optimizer.pt')
        sch_path = os.path.join(params['checkpoint'], 'scheduler.pt')
        if os.path.isfile(opt_path) and os.path.isfile(sch_path):
            print("\nupdating optimizer and scheduler from checkpoint")
            # torch.load unpickles the file: only resume from checkpoints you trust.
            optimizer.load_state_dict(torch.load(opt_path))
            scheduler.load_state_dict(torch.load(sch_path))

        try:
            # The update count is encoded after the last '-' of the checkpoint name.
            resumed_step = int(params['checkpoint'].split('-')[-1].split('/')[0])
        except ValueError:
            print('could not update current steps/epoch form checkpoint name')
        else:
            global_step = resumed_step
            epochs_trained = global_step // updates_per_epoch
            # The skip counter below is consumed once per batch, so convert the
            # remaining optimizer updates into batches.
            steps_trained_in_current_epoch = (global_step % updates_per_epoch) * grad_accum_steps
            print(f"\npicking up from checkpoint at global step:\t{global_step}")
            print(f"continuing training from epoch:\t{epochs_trained}")
            print(f"skipping first steps in epoch:\t\t{steps_trained_in_current_epoch}")

    # NOTE: after a resume, global_step includes updates from the earlier run while
    # train_loss only sums this run, so the reported averages are skewed low.
    train_loss = 0.0

    model.resize_token_embeddings(len(tokenizer))
    model.zero_grad()

    train_iterator = trange(epochs_trained, num_epochs, desc='epoch')

    for _ in train_iterator:
        epoch_iterator = tqdm(dataloader, desc='iteration')

        for step, batch in enumerate(epoch_iterator):
            # Fast-forward over batches already covered by the resumed checkpoint.
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            batch = tuple(t.to(device) for t in batch)
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[2]}

            # Re-enter training mode every step: `evaluate` switches the model to eval mode.
            model.train()

            outputs = model(**inputs)
            loss = outputs[0]

            if grad_accum_steps > 1:
                loss = loss / grad_accum_steps

            loss.backward()

            train_loss += loss.item()

            if (step+1) % grad_accum_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), params['max_grad_norm'])
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if params['checkpoint_steps'] > 0 and global_step % params['checkpoint_steps'] == 0:
                    save_path = os.path.join(params['checkpoint_dir'], f"checkpoint -- {global_step}")
                    os.makedirs(save_path, exist_ok=True)
                    model.save_pretrained(save_path)
                    print(f"saving model checkpoint to:\t{save_path}")
                    torch.save(params, os.path.join(save_path, 'training_args.bin'))
                    torch.save(optimizer.state_dict(), os.path.join(save_path, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(save_path, "scheduler.pt"))

                if params['eval_steps'] > 0 and global_step % params['eval_steps'] == 0:
                    if val_dataset:
                        evaluate(val_dataset, model, tokenizer, params['batch_size'], 'val', device=device)
                        evaluate(dataset, model, tokenizer, params['batch_size'], 'train', max_steps = 200, device=device)
                    print(f"loss:\t\t\t{train_loss / global_step}")

            # Stop as soon as the requested number of updates has been performed.
            if params['max_steps'] > 0 and global_step >= params['max_steps']:
                epoch_iterator.close()
                break

        if params['max_steps'] > 0 and global_step >= params['max_steps']:
            train_iterator.close()
            break

    print(f"saving final model to:\t{params['checkpoint_dir']}")
    # Make sure the directory exists before anything is written into it.
    os.makedirs(params['checkpoint_dir'], exist_ok=True)
    model.save_pretrained(params['checkpoint_dir'])
    torch.save(params, os.path.join(params['checkpoint_dir'], 'training_args.bin'))

    # Guard the average against a run that performed no optimizer update.
    return global_step, train_loss / max(global_step, 1)

## Training

In [None]:
# Seed the Python, NumPy and PyTorch random generators with RANDOM_SEED.
set_seed(RANDOM_SEED)

In [None]:
# Build each dataset only when it is not already in the kernel namespace, so
# re-running this cell does not re-tokenize data that is already loaded.
# The two names are checked independently: if building one split failed on an
# earlier run, the other split's presence must not cause it to be skipped.
if 'train_dataset' not in globals():
    train_dataset = SemEvalDataset(source_path = TRAIN_DATA_FILE, target_path = TRAIN_LABEL_FILE)
if 'val_dataset' not in globals():
    val_dataset = SemEvalDataset(source_path = VAL_DATA_FILE, target_path = VAL_LABEL_FILE)

In [None]:
# `train` returns the number of optimizer updates and the average training loss.
global_steps, train_loss = train(train_dataset, val_dataset=val_dataset, device=device)
print(f"global_steps={global_steps}, average loss={train_loss}")

## Evaluation

In [None]:
# Run `evaluate` with the current MODEL on the validation set and then on the
# training set; no `max_steps` cap is passed. The second call is the cell's
# last expression, so its return value is what the notebook displays.
evaluate(val_dataset, MODEL, TOKENIZER, TRAIN_PARAMS['batch_size'], 'val', device=device)
evaluate(train_dataset, MODEL, TOKENIZER, TRAIN_PARAMS['batch_size'], 'train', device=device)