# Entailment using ClimateBERT

Train a model on the text entailment task using the base model, [ClimateBERT](https://huggingface.co/climatebert/distilroberta-base-climate-f). ClimateBERT is trained on 1.6 million paragraphs of climate-related texts. For more information, check out their [research paper](https://arxiv.org/abs/2110.12010).

In [1]:
# Hub id used further down to load the tokenizer.
base_model = 'distilroberta-base'
# Hub id of the ClimateBERT checkpoint used further down to initialize the classifier.
model_checkpoint = 'climatebert/distilroberta-base-climate-f'

This notebook follows the work presented [here](https://github.com/dh1105/Sentence-Entailment/blob/main/Sentence_Entailment_BERT.ipynb)

## Environment

Install and import packages

In [2]:
# Install the packages imported in the next cell.
! pip install transformers
! pip install torch
! pip install datasets



In [3]:
import numpy as np
import pandas as pd

from html import unescape
from random import randint
import math
import gc

from transformers import pipeline                                                   
from transformers.pipelines.pt_utils import KeyDataset
#import datasets
from datasets import load_dataset, load_metric, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW
from huggingface_hub import notebook_login
import torch

import pickle
import os
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [4]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


Log in to the Hugging Face Hub

In [5]:
# Get an access token from the Hugging Face website: Settings > Access Tokens
# (make sure it is a write token).
# Configure git to store credentials, then open the interactive login prompt.
!git config --global credential.helper store
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


## Read in data

The data used is the original Climate Fever dataset (HuggingFace repo, paper, github), with slight data processing to get it into the correct format for textual entailment.

In [6]:
# Hub id of the dataset to load.
ds_path = 'amandakonet/climate_fever_nli'
# use_auth_token must be True because this is a private dataset
ds = load_dataset(ds_path, use_auth_token=True)

Using custom data configuration amandakonet--climate_fever_adopted-2399aead42b0d456
Reusing dataset parquet (/root/.cache/huggingface/datasets/parquet/amandakonet--climate_fever_adopted-2399aead42b0d456/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/3 [00:00<?, ?it/s]

Note the columns: claim_id, claim, evidence, label, and category. There is one row for every (claim, evidence) pair. Since each claim has 5 associated evidence sentences, there are 5 rows for each claim.

In [7]:
# Display the available splits with their columns and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'label', 'category'],
        num_rows: 4298
    })
    valid: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'label', 'category'],
        num_rows: 1842
    })
    test: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'label', 'category'],
        num_rows: 1535
    })
})

Set up our dataset according to the correct format for NLI.

In [8]:
class MNLIDataBert(Dataset):
  """Tokenize (claim, evidence) pairs and expose them as padded tensor datasets.

  Every example is encoded as ``<cls> claim <sep> evidence <sep>`` using the
  tokenizer's own special-token ids, and its string label is mapped to an
  integer class id through ``label_dict``.

  Args:
    ds: Mapping with ``'train'`` and ``'valid'`` splits. Each split must expose
      ``'claim'``, ``'evidence'`` and ``'label'`` columns.
    base_model: Name or path handed to ``AutoTokenizer.from_pretrained``.
    max_len: Maximum number of tokens per encoded pair, special tokens
      included. Longer pairs are truncated.
  """

  def __init__(self, ds, base_model, max_len=512):
    self.label_dict = {'SUPPORTS' : 0, 'REFUTES':1, 'NOT_ENOUGH_INFO' : 2}

    self.train_df = ds['train']
    self.val_df = ds['valid']

    self.base_path = '/content/'
    self.max_len = max_len
    # pretrained base model tokenizer
    self.tokenizer = AutoTokenizer.from_pretrained(base_model, do_lower_case=True)
    self.train_data = None
    self.val_data = None
    self.init_data()

  def init_data(self):
    """Encode the train and validation splits into ``TensorDataset`` objects."""
    self.train_data = self.load_data(self.train_df, self.max_len)
    self.val_data = self.load_data(self.val_df, self.max_len)

  @staticmethod
  def _truncate_pair(premise_id, hypothesis_id, budget):
    """Trim both token lists in place until their combined length fits `budget`.

    Tokens are dropped one at a time from the end of whichever list is
    currently longer (the hypothesis when they are tied).
    """
    while len(premise_id) + len(hypothesis_id) > budget:
      longer = premise_id if len(premise_id) > len(hypothesis_id) else hypothesis_id
      longer.pop()

  def load_data(self, df, max_len=512):
    """Encode one split as a ``TensorDataset``.

    Args:
      df: Split exposing ``'claim'``, ``'evidence'`` and ``'label'`` columns of
        equal length.
      max_len: Maximum number of tokens per pair, including the three special
        tokens.

    Returns:
      ``TensorDataset(token_ids, mask_ids, seg_ids, y)``. The first three
      tensors are right-padded to the longest pair in the split.

    Raises:
      ValueError: If ``max_len`` leaves no room for the three special tokens.
      KeyError: If a label is not present in ``label_dict``.
    """
    if max_len < 3:
      raise ValueError(f"max_len must be at least 3, got {max_len}")

    tokenizer = self.tokenizer
    # Pad with the tokenizer's own pad id; fall back to 0 when it defines none.
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
      pad_id = 0
    budget = max_len - 3  # room left once cls + 2 * sep are accounted for

    token_ids = []
    mask_ids = []
    seg_ids = []
    y = []

    for premise, hypothesis, label in zip(df['claim'], df['evidence'], df['label']):
      premise_id = list(tokenizer.encode(premise, add_special_tokens=False))
      hypothesis_id = list(tokenizer.encode(hypothesis, add_special_tokens=False))
      self._truncate_pair(premise_id, hypothesis_id, budget)

      pair_token_ids = (
          [tokenizer.cls_token_id] + premise_id + [tokenizer.sep_token_id]
          + hypothesis_id + [tokenizer.sep_token_id]
      )
      # Segment 0 covers cls + premise + sep, segment 1 covers hypothesis + sep.
      segment_ids = [0] * (len(premise_id) + 2) + [1] * (len(hypothesis_id) + 1)

      token_ids.append(torch.tensor(pair_token_ids))
      seg_ids.append(torch.tensor(segment_ids))
      # Real tokens get 1; the padding added below gets 0.
      mask_ids.append(torch.ones(len(pair_token_ids), dtype=torch.long))
      y.append(self.label_dict[label])

    token_ids = pad_sequence(token_ids, batch_first=True, padding_value=pad_id)
    mask_ids = pad_sequence(mask_ids, batch_first=True)
    seg_ids = pad_sequence(seg_ids, batch_first=True)
    y = torch.tensor(y)
    dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)
    print(len(dataset))
    return dataset

  def get_data_loaders(self, batch_size=32, shuffle=True):
    """Return ``(train_loader, val_loader)`` built over the encoded splits.

    Both loaders use the same ``batch_size`` and ``shuffle`` setting.
    """
    train_loader = DataLoader(
      self.train_data,
      shuffle=shuffle,
      batch_size=batch_size
    )

    val_loader = DataLoader(
      self.val_data,
      shuffle=shuffle,
      batch_size=batch_size
    )

    return train_loader, val_loader

In [9]:
# Tokenize the train and validation splits with the `base_model` tokenizer.
# NOTE(review): the classifier is loaded from `model_checkpoint`, not `base_model`;
# if that checkpoint ships its own (presumably extended) vocabulary, the tokenizer
# should probably come from `model_checkpoint` as well — confirm.
mnli_dataset = MNLIDataBert(ds, base_model)
# Batches of 8; both loaders shuffle because get_data_loaders defaults to shuffle=True.
train_loader, val_loader = mnli_dataset.get_data_loaders(batch_size=8)

4298
1842


Next, we initialize a model from the ClimateBert checkpoint and set up the optimizer for training. 

In [10]:
# Load the checkpoint with a 3-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)
# Move the model's parameters to the selected device.
model.to(device)

Some weights of the model checkpoint at climatebert/distilroberta-base-climate-f were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at climatebert/distilroberta-base-climate-f and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifi

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50500, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

Set up the optimizer. 

In [11]:
# Split the parameters into two groups: biases and LayerNorm weights are
# excluded from weight decay, everything else is decayed.
param_optimizer = list(model.named_parameters())
# Substrings matched against parameter names such as
# '...attention.output.LayerNorm.weight' or '...dense.bias'.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     # The optimizer reads the per-group key 'weight_decay'; any other key
     # (e.g. 'weight_decay_rate') is silently ignored and disables decay.
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=False)



Set up our accuracy measurement

In [12]:
def multi_acc(y_pred, y_test):
  """Return the fraction of rows in `y_pred` whose top-scoring class equals `y_test`.

  `y_pred` holds one row of class scores per example (classes along dim 1) and
  `y_test` the matching class indices. The result is a scalar float tensor.
  """
  predicted_class = torch.log_softmax(y_pred, dim=1).argmax(dim=1)
  num_correct = (predicted_class == y_test).sum().float()
  num_examples = float(y_test.size(0))
  return num_correct / num_examples

Train!

In [13]:
import time

EPOCHS = 5


def _run_epoch(model, loader, optimizer=None):
  """Run one pass over `loader`; train when `optimizer` is given, else evaluate.

  Returns:
    ``(mean_loss, mean_acc)`` averaged over the samples seen, so a short final
    batch is not over-weighted.
  """
  training = optimizer is not None
  model.train(training)

  # Only pass segment ids to models that have more than one token-type
  # embedding; a table with a single row cannot index segment id 1.
  config = getattr(model, 'config', None)
  use_segments = getattr(config, 'type_vocab_size', 1) > 1

  loss_sum = 0.0
  acc_sum = 0.0
  n_samples = 0
  with torch.set_grad_enabled(training):
    for pair_token_ids, mask_ids, seg_ids, y in loader:
      pair_token_ids = pair_token_ids.to(device)
      labels = y.to(device)
      inputs = {'attention_mask': mask_ids.to(device), 'labels': labels}
      if use_segments:
        inputs['token_type_ids'] = seg_ids.to(device)

      if training:
        optimizer.zero_grad()

      outputs = model(pair_token_ids, **inputs)
      # Read the fields by name instead of relying on their order in the output.
      loss, prediction = outputs.loss, outputs.logits

      if training:
        loss.backward()
        optimizer.step()

      batch_size = labels.size(0)
      loss_sum += loss.item() * batch_size
      acc_sum += multi_acc(prediction, labels).item() * batch_size
      n_samples += batch_size

  if n_samples == 0:
    return 0.0, 0.0
  return loss_sum / n_samples, acc_sum / n_samples


def train(model, train_loader, val_loader, optimizer, epochs=None):
  """Fine-tune `model` and report train/validation loss and accuracy per epoch.

  Args:
    model: Sequence-classification model whose output exposes ``loss`` and
      ``logits`` when called with ``labels``.
    train_loader: Loader yielding ``(token_ids, mask_ids, seg_ids, y)`` batches.
    val_loader: Loader with the same batch layout, used for evaluation only.
    optimizer: Optimizer already bound to the model's parameters.
    epochs: Number of epochs to run; defaults to the module-level ``EPOCHS``.

  Returns:
    A list with one dict per epoch holding ``train_loss``, ``train_acc``,
    ``val_loss`` and ``val_acc``.
  """
  if epochs is None:
    epochs = EPOCHS

  history = []
  for epoch in range(epochs):
    start = time.time()
    train_loss, train_acc = _run_epoch(model, train_loader, optimizer)
    val_loss, val_acc = _run_epoch(model, val_loader)
    end = time.time()
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)

    print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
    print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

    history.append({
        'train_loss': train_loss,
        'train_acc': train_acc,
        'val_loss': val_loss,
        'val_acc': val_acc,
    })

  return history


In [None]:
# Fine-tune the classifier; per-epoch loss and accuracy are printed as training runs.
train(model, train_loader, val_loader, optimizer)