In [1]:
import os
import random
import re
from typing import Dict, List, Tuple

import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange
from tensorboardX import SummaryWriter

from transformers import WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup
from transformers import BertConfig, BertForMaskedLM, BertTokenizer

# Fine-tuning with Unlabelled Data

## Parameters

In [3]:
# Run on the GPU when one is available; otherwise fall back to the CPU so the
# notebook still executes (slowly) on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Any of the pretrained models listed at https://huggingface.co/transformers/pretrained_models.html
MODEL_NAME = "bert-base-cased"

# NOTE(review): absolute, machine-specific paths -- edit before running elsewhere.
# TRAIN_DATA_FILE is read line by line by LineByLineDataset; OUTPUT_DIR receives
# the checkpoints and the final model written by train().
TRAIN_DATA_FILE = "D:/Users/Beth/Documents/tweet_data/cc_tweet_data.txt"
OUTPUT_DIR = "D:/Users/Beth/Documents/climate-change-emo-analysis/fine-tuning-outputs/"

# Passed to the tokenizer as max_length when the dataset is encoded.
MAX_LENGTH = 64
# Seed handed to set_seed() before the first evaluation.
RANDOM_SEED = 100

In [4]:
# Hyper-parameters and bookkeeping options read by train().
TRAIN_PARAMS = {
    'batch_size': 8,             # examples per DataLoader batch
    'learning_rate': 1e-5,       # AdamW learning rate
    'weight_decay': 1e-5,        # applied to every parameter except biases and LayerNorm weights
    'adam_epsilon': 1e-8,        # AdamW eps
    'max_grad_norm': 1.0,        # gradient-norm clipping threshold
    'grad_accum_steps': 1,       # batches accumulated per optimizer update
    'warmup_steps': 500,         # linear warm-up length of the LR schedule
    'checkpoint_steps': 2500,    # save a checkpoint every N optimizer updates (<= 0 disables)
    'checkpoint_dir': OUTPUT_DIR,
    'eval_steps': 250,           # report the running loss every N optimizer updates (<= 0 disables)
    'num_train_epochs': 1,
    'max_steps': -1, # if >0, overrides num_train_epochs
    # Path to resume from, or None for a fresh run. train() looks for
    # optimizer.pt / scheduler.pt inside this path and parses the step count
    # from the text after its last '-', which matches the "checkpoint--<step>"
    # directories it writes. A directory is therefore accepted here as well as
    # the original '.pt' suffix check.
    'checkpoint': MODEL_NAME if (MODEL_NAME.endswith('.pt') or os.path.isdir(MODEL_NAME)) else None,
    'mlm_probability': 0.15      # fraction of eligible tokens selected for masking
}

## Load Model

In [5]:
# Fetch the configuration, the cased vocabulary and the masked-LM weights for MODEL_NAME.
CONFIG = BertConfig.from_pretrained(MODEL_NAME)
TOKENIZER = BertTokenizer.from_pretrained(
    MODEL_NAME,
    do_lower_case=False,
    config=CONFIG,
)
MODEL = BertForMaskedLM.from_pretrained(
    MODEL_NAME,
    config=CONFIG,
)

# Eval mode switches dropout off so the pre-training perplexity below is
# reproducible; comment out otherwise. train() re-enables training mode itself.
MODEL.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [6]:
# Move the model onto `device`; the call is the cell's last expression, so the
# module's repr is what gets displayed below.
MODEL.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [7]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# Report how many trainable parameters the loaded model has.
print(f"number of parameters in the model={count_parameters(MODEL)}")

number of parameters in the model=108931396


## Methods

In [8]:
def set_seed(seed):
    """Seed Python's ``random``, NumPy's global RNG and PyTorch with ``seed``.

    The three generators are seeded in that order with the same value.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

### Dataset

In [9]:
class LineByLineDataset(Dataset):
    """Token-id sequences built from a UTF-8 text file, one example per line.

    Empty and whitespace-only lines are dropped. The remaining lines are
    encoded in a single ``tokenizer.batch_encode_plus`` call with special
    tokens added and ``block_size`` passed as ``max_length``; only the
    resulting ``input_ids`` are kept.
    """

    def __init__(self, file_path: str, tokenizer=TOKENIZER, block_size=MAX_LENGTH):
        assert os.path.isfile(file_path)
        print(f"Creating features from dataset file at {file_path}")

        # The whole file is read into memory before filtering.
        with open(file_path, encoding="utf-8") as handle:
            raw_lines = handle.read().splitlines()

        kept_lines = []
        for text in raw_lines:
            if len(text) == 0 or text.isspace():
                continue
            kept_lines.append(text)

        encoded = tokenizer.batch_encode_plus(kept_lines, add_special_tokens=True, max_length=block_size)
        self.examples = encoded["input_ids"]

    def __len__(self):
        """Number of encoded lines."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return example ``i`` as a freshly built ``torch.long`` tensor."""
        return torch.tensor(self.examples[i], dtype=torch.long)

### Mask Tokens

In [10]:
def mask_tokens(inputs: torch.Tensor, tokenizer=TOKENIZER, mlm_probability=0.15) -> Tuple[torch.Tensor, torch.Tensor]:
    """Build masked-language-modelling inputs and labels from a batch of token ids.

    Each token that is neither a special token nor padding is selected with
    probability ``mlm_probability``. Of the selected tokens, 80% are replaced
    by the tokenizer's mask token, 10% by a random vocabulary id, and 10% are
    left as they are.

    Note: ``inputs`` is modified in place and is the same tensor that is returned.

    Returns:
        ``(inputs, labels)`` where ``labels`` holds the original id at every
        selected position and -100 everywhere else (presumably the value the
        model's loss ignores -- confirm against the transformers version in use).
    """
    labels = inputs.clone()

    # Selection probability per position; special tokens and padding get 0.
    select_prob = torch.full(labels.shape, mlm_probability)
    special_flags = torch.tensor(
        [tokenizer.get_special_tokens_mask(row, already_has_special_tokens=True) for row in labels.tolist()],
        dtype=torch.bool,
    )
    select_prob.masked_fill_(special_flags, value=0.0)
    if tokenizer._pad_token is not None:
        select_prob.masked_fill_(labels.eq(tokenizer.pad_token_id), value = 0.0)

    selected = torch.bernoulli(select_prob).bool()
    labels[~selected] = -100

    # 80% of the selected positions become the mask token.
    use_mask_token = selected & torch.bernoulli(torch.full(labels.shape, 0.8)).bool()
    inputs[use_mask_token] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # Half of what is left (10% overall) is swapped for a random vocabulary id.
    use_random_token = selected & ~use_mask_token & torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
    replacement_ids = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[use_random_token] = replacement_ids[use_random_token]

    # The final 10% of selected positions keep their original token.
    return inputs, labels

### Evaluation Function

In [11]:
def evaluate(dataset, model, tokenizer, batch_size, key, mlm_probability=0.15, max_steps=None, device=torch.device('cuda')) -> torch.Tensor:
    """Estimate the masked-LM perplexity of ``model`` on ``dataset``.

    Batches are read in dataset order, masked with ``mask_tokens`` and scored
    by the model; perplexity is ``exp`` of the mean per-batch loss. Masking is
    random, so seed the RNGs beforehand if the number must be repeatable.
    The model is put into eval mode and left there.

    Args:
        dataset: dataset yielding 1-D token-id tensors.
        model: called as ``model(inputs, masked_lm_labels=labels)``; its first
            output is treated as the loss.
        tokenizer: supplies the padding id and is forwarded to ``mask_tokens``.
        batch_size: number of examples per batch.
        key: label for the dataset, used only in the progress message.
        mlm_probability: forwarded to ``mask_tokens``.
        max_steps: when a positive number, stop after that many batches;
            ``None`` or a non-positive value evaluates the whole dataset.
        device: device the masked batch is moved to before the forward pass.

    Returns:
        A scalar tensor holding the perplexity.

    Raises:
        ValueError: if the dataset produced no batches.
    """

    def collate(examples: List[torch.Tensor]):
        # Pad every example to the longest sequence in the batch.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(
        dataset, sampler=sampler, batch_size=batch_size, collate_fn=collate
    )

    print(f"Beginning evaluation on dataset {key}")

    # Previously max_steps was accepted but ignored, so a "quick" evaluation
    # still walked the entire dataset.
    step_limit = max_steps if (max_steps is not None and max_steps > 0) else None

    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    progress = tqdm(dataloader, desc="Evaluating", leave = False,)
    for batch in progress:
        if step_limit is not None and nb_eval_steps >= step_limit:
            progress.close()
            break

        inputs, labels = mask_tokens(batch, tokenizer, mlm_probability)
        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    if nb_eval_steps == 0:
        raise ValueError(f"cannot evaluate dataset {key}: it produced no batches")

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    print(f"perplexity: {perplexity}")
    return perplexity

### Training Function

In [12]:
def train(dataset, model=MODEL, tokenizer=TOKENIZER, params = TRAIN_PARAMS, val_dataset = None, device=torch.device("cuda")):
    """Fine-tune ``model`` on ``dataset`` with the masked-language-modelling objective.

    Batches are drawn in random order, masked with ``mask_tokens`` and fed to
    the model. AdamW with a linear warm-up/decay schedule performs one update
    every ``grad_accum_steps`` batches. Checkpoints (model, optimizer,
    scheduler, params) are written to ``checkpoint_dir/checkpoint--<step>``,
    and the final model is saved to ``checkpoint_dir``.

    Args:
        dataset: training dataset yielding 1-D token-id tensors.
        model: called as ``model(inputs, masked_lm_labels=labels)``; its first
            output is treated as the loss.
        tokenizer: supplies the padding id and vocabulary size; forwarded to
            ``mask_tokens``.
        params: dict with the keys ``batch_size``, ``learning_rate``,
            ``weight_decay``, ``adam_epsilon``, ``max_grad_norm``,
            ``grad_accum_steps``, ``warmup_steps``, ``checkpoint_steps``,
            ``checkpoint_dir``, ``eval_steps``, ``num_train_epochs``,
            ``max_steps`` (> 0 overrides ``num_train_epochs``), ``checkpoint``
            (path to resume from, or a falsy value) and ``mlm_probability``.
        val_dataset: optional; when given, perplexity on it and on the first
            200 training batches is reported every ``eval_steps`` updates.
        device: device each batch is moved to.

    Returns:
        ``(global_step, average_loss)``, where the average is taken over the
        optimizer updates performed in this call.
    """

    def collate(examples: List[torch.Tensor]):
        # Pad every example to the longest sequence in the batch.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    sampler = RandomSampler(dataset)
    dataloader = DataLoader(dataset,
        sampler = sampler,
        batch_size = params['batch_size'],
        collate_fn = collate
    )

    grad_accum_steps = params['grad_accum_steps']
    # Optimizer updates per pass over the data. Clamped to 1 so the divisions
    # below stay defined for datasets smaller than one accumulation window.
    updates_per_epoch = max(len(dataloader) // grad_accum_steps, 1)

    if params['max_steps'] > 0:
        t_total = params['max_steps']
        # Was `max_steps // len(dataloader) // grad_accum_steps + 1`, which
        # divides by the product and schedules too few epochs when
        # accumulating gradients.
        num_epochs = params['max_steps'] // updates_per_epoch + 1
    else:
        t_total = len(dataloader) // grad_accum_steps * params['num_train_epochs']
        num_epochs = params['num_train_epochs']

    #prepare optimizer and scheduler (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_params = [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": params["weight_decay"]},
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
    ]
    optimizer = AdamW(optimizer_params, lr=params["learning_rate"], eps=params["adam_epsilon"])
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=params["warmup_steps"], num_training_steps=t_total)
    print("training...")
    print(f"num examples:\t\t\t{len(dataset)}")
    print(f"num epochs:\t\t\t{num_epochs}")
    if grad_accum_steps > 1:
        print("gradient accumulation steps:\t{}".format(grad_accum_steps))
        # Effective batch size per optimizer update (previously printed the raw batch size).
        print("batch size with accumulation:\t{}".format(params["batch_size"] * grad_accum_steps))
    else:
        print("batch size:\t\t\t{}".format(params["batch_size"]))
    print(f"total optimization steps:\t{t_total}")

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    if params['checkpoint']:
        opt_path = os.path.join(params['checkpoint'], 'optimizer.pt')
        sch_path = os.path.join(params['checkpoint'], 'scheduler.pt')
        if os.path.isfile(opt_path) and os.path.isfile(sch_path):
            print("\nupdating optimizer and scheduler from checkpoint")
            optimizer.load_state_dict(torch.load(opt_path))
            scheduler.load_state_dict(torch.load(sch_path))

        try:
            # The step count is the text after the last '-' in the checkpoint path.
            global_step = int(params['checkpoint'].split('-')[-1].split('/')[0])
            epochs_trained = global_step // updates_per_epoch
            # global_step counts optimizer updates, but the skip counter below
            # is decremented once per batch, so convert updates to batches.
            steps_trained_in_current_epoch = (global_step % updates_per_epoch) * grad_accum_steps
            print(f"\npicking up from checkpoint at global step:\t{global_step}")
            print(f"continuing training from epoch:\t\t{epochs_trained}")
            print(f"skipping first steps in epoch:\t\t{steps_trained_in_current_epoch}")
        except ValueError:
            print("could not update current steps/epoch from checkpoint name")

    # Updates completed before this call; the loss average must not count them.
    start_step = global_step

    training_loss, logging_loss = 0.0, 0.0

    model.resize_token_embeddings(len(tokenizer))
    model.zero_grad()

    train_iterator = trange(epochs_trained, num_epochs, desc="epoch")

    for _ in train_iterator:
        epoch_iterator = tqdm(dataloader, desc="iteration")

        for step, batch in enumerate(epoch_iterator):
            # When resuming, fast-forward past batches of the interrupted epoch.
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = mask_tokens(batch, tokenizer, params['mlm_probability'])
            inputs = inputs.to(device)
            labels = labels.to(device)
            # evaluate() leaves the model in eval mode, so re-enable training every step.
            model.train()
            outputs = model(inputs, masked_lm_labels=labels)
            loss = outputs[0]

            if grad_accum_steps > 1:
                loss = loss / grad_accum_steps

            loss.backward()

            training_loss += loss.item()

            if (step+1) % grad_accum_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), params['max_grad_norm'])
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if params['checkpoint_steps'] > 0 and global_step % params['checkpoint_steps'] == 0:
                    save_path = os.path.join(params['checkpoint_dir'], f"checkpoint--{global_step}")
                    os.makedirs(save_path, exist_ok=True)

                    model.save_pretrained(save_path)
                    #tokenizer.save_pretrained(save_path)

                    print(f"saving model checkpoint to:\t{save_path}")
                    torch.save(params, os.path.join(save_path, 'training_args.bin'))
                    torch.save(optimizer.state_dict(), os.path.join(save_path, 'optimizer.pt'))
                    torch.save(scheduler.state_dict(), os.path.join(save_path, 'scheduler.pt'))

                if params['eval_steps'] > 0 and global_step % params['eval_steps'] == 0:
                    if val_dataset:
                        # evaluate() takes (..., batch_size, key, mlm_probability); the last
                        # two were previously passed in the opposite order.
                        evaluate(val_dataset, model, tokenizer, params['batch_size'], 'val', params['mlm_probability'], device=device)
                        evaluate(dataset, model, tokenizer, params['batch_size'], 'train', params['mlm_probability'], max_steps = 200, device=device)
                    print(f"loss:\t\t\t{training_loss/(global_step - start_step)}")
                    print()

            # Stop exactly at max_steps (`>` allowed one extra update).
            if params['max_steps'] > 0 and global_step >= params["max_steps"]:
                epoch_iterator.close()
                break

        if params['max_steps'] > 0 and global_step >= params['max_steps']:
            train_iterator.close()
            break

    print('saving final model to:\t', params['checkpoint_dir'])
    # Ensure the target exists so a long run cannot fail at the very last step.
    os.makedirs(params['checkpoint_dir'], exist_ok=True)
    model.save_pretrained(params['checkpoint_dir'])
    #tokenizer.save_pretrained(params['checkpoint_dir'])
    torch.save(params, os.path.join(params['checkpoint_dir'], 'training_args.bin'))

    # max(..., 1) keeps the return defined when no update was performed.
    return global_step, training_loss / max(global_step - start_step, 1)

## Evaluate perplexity of BERT on this data prior to fine-tuning

In [14]:
# Build the dataset only when the kernel does not already hold one, so
# re-running this cell skips re-tokenising the file.
# NOTE(review): a stale `train_dataset` left from an earlier session is reused
# silently; restart the kernel after changing TRAIN_DATA_FILE or MAX_LENGTH.
if 'train_dataset' not in globals():
    train_dataset = LineByLineDataset(TRAIN_DATA_FILE)
    # No validation split is loaded; VAL_DATA_FILE is not defined in this notebook.
    #val_dataset = LineByLineDataset(VAL_DATA_FILE)

Creating features from dataset file at D:/Users/Beth/Documents/tweet_data/cc_tweet_data.txt


In [15]:
# Seed the RNGs before evaluating: mask_tokens draws its masks at random.
set_seed(RANDOM_SEED)

In [16]:
# Baseline: masked-LM perplexity of the pretrained model on the tweet corpus.
evaluate(train_dataset, MODEL, TOKENIZER, TRAIN_PARAMS['batch_size'], 'train', TRAIN_PARAMS['mlm_probability'], device=device)

Beginning evaluation on dataset train


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=121042.0, style=ProgressStyle(descriptio…

perplexity: 46.09406280517578


tensor(46.0941)

## Fine-tune on unlabelled tweets

In [17]:
# Fine-tune with the defaults bound in train()'s signature (MODEL, TOKENIZER,
# TRAIN_PARAMS); no validation set is passed, so only the running loss is logged.
global_step, train_loss = train(train_dataset, device=device)
print(f"global step={global_step}, average loss={train_loss}")

training...
num examples:			968331
num epochs:			1
batch size:			8
total optimization steps:	121042


HBox(children=(FloatProgress(value=0.0, description='epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='iteration', max=121042.0, style=ProgressStyle(description…

loss:			3.606213059425354

loss:			3.339893476366997

loss:			3.2189694543679557

loss:			3.1350197797417643

loss:			3.0648388804912567

loss:			3.0124473676284156

loss:			2.981946626492909

loss:			2.93802492287755

loss:			2.9019777399169073

saving model checkpoint to:	D:/Users/Beth/Documents/climate-change-emo-analysis/fine-tuning-outputs/checkpoint--2500
loss:			2.867158103632927

loss:			2.8477999869476665

loss:			2.818963876903057

loss:			2.799591643388455

loss:			2.7840047210880687

loss:			2.768163042314847

loss:			2.7536722036376595

loss:			2.7389468030158213

loss:			2.7228735106852318

loss:			2.7144177089051196

saving model checkpoint to:	D:/Users/Beth/Documents/climate-change-emo-analysis/fine-tuning-outputs/checkpoint--5000
loss:			2.7029285488426686

loss:			2.695339107257979

loss:			2.6867705233801495

loss:			2.6801998217468674

loss:			2.6673623916258413

loss:			2.6596703480672836

loss:			2.6518531059347668

loss:			2.641180512821233

loss:			2.63631385342

## Evaluate perplexity of fine-tuned model

In [18]:
# Same measurement after fine-tuning. This is the data the model was trained on,
# so it shows fit to the corpus rather than held-out performance.
# NOTE(review): the seed is not reset before this call, so the random masks
# differ from those used in the baseline evaluation.
evaluate(train_dataset, MODEL, TOKENIZER, TRAIN_PARAMS['batch_size'], 'train', TRAIN_PARAMS['mlm_probability'], device=device)

Beginning evaluation on dataset train


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=121042.0, style=ProgressStyle(descriptio…

perplexity: 6.686666488647461


tensor(6.6867)