<a href="https://colab.research.google.com/github/ZahraDehghani99/Ensembling-HuggingFaceTransformer-models/blob/main/Ensembling_medium.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Toy corpus of [question, answer, label] rows.
# label 1: the answer belongs to the question; label 0: unrelated sentence.
# Each positive answer names the same country as its question (the earlier
# version repeated "Russia" in every positive answer).
dataset = [["What is the capital of Russia?", "The capital of Russia is Moscow.", 1],
           ["What is the capital of India?", "The capital of India is Delhi.", 1],
           ["What is the capital of United States?", "The capital of United States is Washington.", 1],
           ["What is the capital of Germany?", "The capital of Germany is Berlin.", 1],
           ["What is the capital of France?", "The capital of France is Paris.", 1],
           ["What is the capital of Russia?", "Goku loves chi chi.", 0],
           ["What is the capital of India?", "Gohan is better than Goku for sure.", 0],
           ["What is the capital of United States?", "Freeza has to freeze.", 0],
           ["What is the capital of Germany?", "Einstien should have nuked Hitler.", 0],
           ["What is the capital of France?", "Newton lost it when the apple fell on his head.", 0]]

In [None]:
# Install the Hugging Face `transformers` package into the runtime.
# NOTE(review): the version is not pinned; the recorded output of this cell
# shows transformers 3.0.1 being downloaded — consider pinning for reproducibility.
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/82/25/89050e69ed53c2a3b7f8c67844b3c8339c1192612ba89a172cf85b298948/transformers-3.0.1-py3-none-any.whl (757kB)
[K     |████████████████████████████████| 757kB 2.8MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 31.1MB/s 
Collecting tokenizers==0.8.0-rc4
[?25l  Downloading https://files.pythonhosted.org/packages/e8/bd/e5abec46af977c8a1375c1dca7cb1e5b3ec392ef279067af7f6bc50491a0/tokenizers-0.8.0rc4-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 40.6MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl 

In [None]:
import numpy as np

import torch
from torch import nn
from transformers import BertPreTrainedModel, BertConfig, BertModel, BertTokenizer, AdamW

import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader, RandomSampler, Dataset, SequentialSampler

# The core model: an ensemble of two BERT encoders

In [None]:
class BertEnsembleForNextSentencePrediction(BertPreTrainedModel):
  """Two independent BERT encoders feeding one 2-way linear classifier.

  ``bert_model_1`` encodes the first view of each example (question, answer)
  and ``bert_model_2`` the second view (answer, question). Element 1 of each
  encoder's output is concatenated along the feature axis and projected to
  two logits.
  """

  def __init__(self, config, *args, **kwargs):
    """Build both encoders and the classification head from ``config``.

    ``*args`` / ``**kwargs`` are accepted for signature compatibility and are
    not used.
    """
    super().__init__(config)

    # model for QA
    self.bert_model_1 = BertModel(config)
    # model for AQ
    self.bert_model_2 = BertModel(config)
    # combine the 2 models into 1: the head sees both encoder outputs side by side
    self.cls = nn.Linear(2 * self.config.hidden_size, 2)
    self.init_weights()

  @staticmethod
  def _select(pair, index):
    """Return ``pair[index]``, or ``None`` when no pair was supplied."""
    return None if pair is None else pair[index]

  def forward(
          self,
          input_ids=None,
          attention_mask=None,
          token_type_ids=None,
          position_ids=None,
          head_mask=None,
          inputs_embeds=None,
          next_sentence_label=None,
  ):
    """Run both encoders and classify the concatenated result.

    Args:
      input_ids: pair ``[ids_for_model_1, ids_for_model_2]``.
      attention_mask: optional pair laid out like ``input_ids``; when omitted
        each encoder falls back to its own default mask.
      token_type_ids: optional pair laid out like ``input_ids``; forwarded to
        the matching encoder when given (previously accepted but dropped).
      position_ids, head_mask, inputs_embeds: accepted for signature
        compatibility only; they are not forwarded to the encoders.
      next_sentence_label: optional class indices; entries equal to -1 are
        ignored by the loss.

    Returns:
      ``(loss, logits)`` when ``next_sentence_label`` is given, otherwise just
      ``logits``.
    """
    pooled_outputs = []
    for index, encoder in enumerate((self.bert_model_1, self.bert_model_2)):
      encoder_outputs = encoder(
          input_ids[index],
          attention_mask=self._select(attention_mask, index),
          token_type_ids=self._select(token_type_ids, index),
      )
      # Element 1 of BertModel's output is the pooled sentence representation
      # (derived from the [CLS] position), not the raw [CLS] hidden state.
      pooled_outputs.append(encoder_outputs[1])

    combined = torch.cat(pooled_outputs, dim=1)
    logits = self.cls(combined)

    if next_sentence_label is None:
      return logits

    # crossentropyloss: https://pytorch.org/docs/stable/nn.html#crossentropyloss
    loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
    next_sentence_loss = loss_fct(logits.view(-1, 2), next_sentence_label.view(-1))
    return next_sentence_loss, logits

In [None]:
# Seed torch so weight initialisation and dropout are repeatable across runs.
torch.manual_seed(0)

# Prefer the GPU but fall back to the CPU so the notebook still runs without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): BertConfig() with no arguments builds the model from library
# defaults and the encoders are freshly initialised here — no pretrained
# checkpoint is loaded, while the tokenizer below comes from "bert-base-cased".
# Confirm this is intended (and that the vocabulary sizes are compatible).
config = BertConfig()
model = BertEnsembleForNextSentencePrediction(config)
model.to(device)
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
learning_rate = 1e-5

# Parameters whose names contain one of these fragments are conventionally
# exempt from weight decay. They still have to be handed to the optimizer:
# the earlier single-group setup left them out entirely, so biases and
# LayerNorm weights were never updated.
no_decay = ["bias", "LayerNorm.weight"]
named_parameters = list(model.named_parameters())
optimizer_grouped_parameters = [
  {
    "params": [p for n, p in named_parameters if not any(nd in n for nd in no_decay)],
  },
  {
    "params": [p for n, p in named_parameters if any(nd in n for nd in no_decay)],
    "weight_decay": 0.0,
  },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# Prepare the dataset: tokenize the pairs and wrap them in a PyTorch `Dataset`

In [None]:
def prepare_data(dataset, qa=True, max_length=128):
  """Tokenize every ``[question, answer, label]`` row into padded model inputs.

  Uses the module-level ``tokenizer``.

  Args:
    dataset: iterable of 3-item rows ``[question, answer, label]``.
    qa: when truthy, each pair is encoded as (question, answer); otherwise the
      two texts are swapped and encoded as (answer, question).
    max_length: length every encoded pair is padded / truncated to.

  Returns:
    ``(input_ids, attention_masks, labels)``. The first two are tensors with
    one row per example (per-example encodings concatenated along dim 0);
    ``labels`` is a plain list holding the last element of every row. For an
    empty ``dataset`` both tensors have shape ``(0, max_length)``.
  """
  input_ids, attention_masks, labels = [], [], []
  for point in dataset:
    question, answer, label = point
    # Pick the sentence order for this view of the data.
    first, second = (question, answer) if qa else (answer, question)
    encoded_dict = tokenizer.encode_plus(
      first,  # Sentence 1 to encode.
      second,  # Sentence 2 to encode.
      add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
      max_length=max_length,  # Pad & truncate all sentences.
      padding="max_length",  # replaces the deprecated pad_to_max_length=True
      truncation=True,
      return_attention_mask=True,  # Construct attn. masks.
      return_tensors='pt',  # Return pytorch tensors.
    )
    input_ids.append(encoded_dict["input_ids"])
    attention_masks.append(encoded_dict["attention_mask"])
    labels.append(label)

  if not labels:
    # torch.cat rejects an empty list, so return correctly shaped empty tensors.
    return (torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, max_length), dtype=torch.long),
            labels)

  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  return input_ids, attention_masks, labels

In [None]:
class QADataset(Dataset):
  """
  Map-style dataset over pre-tokenized inputs.

  Inputs and masks are stored as numpy arrays. Each item is
  ``(input_ids, attention_mask, label)``, or ``(input_ids, attention_mask)``
  when the dataset was built without labels.
  """
  def __init__(self, input_ids, attention_masks, labels=None):
    self.input_ids = np.array(input_ids)
    self.attention_masks = np.array(attention_masks)
    # torch.tensor(None) raises, so the labels=None default is handled explicitly.
    self.labels = None if labels is None else torch.tensor(labels, dtype=torch.long)

  def __getitem__(self, index):
    item = (self.input_ids[index], self.attention_masks[index])
    if self.labels is None:
      return item
    return item + (self.labels[index],)

  def __len__(self):
    # One example per row of the input-id matrix.
    return self.input_ids.shape[0]

# Train

In [None]:
# standard pytorch way of doing things
# 1. create a custom Dataset
# 2. pass the dataset to a dataloader
# 3. iterate the dataloader and pass the inputs to the model

input_ids_qa, attention_masks_qa, labels_qa = prepare_data(dataset)
train_dataset_qa = QADataset(input_ids_qa, attention_masks_qa, labels_qa)

input_ids_aq, attention_masks_aq, labels_aq = prepare_data(dataset, qa=False)
train_dataset_aq = QADataset(input_ids_aq, attention_masks_aq, labels_aq)

epochs = 5
batch_size = 5

# training so, dropout needed to avoid overfitting (set once; it does not change per step)
model.train()

for epoch in range(epochs):
  # The rows are ordered by label, so sequential batches would each contain a
  # single class. Draw one permutation per epoch and apply it to BOTH views so
  # the QA and AQ batches stay aligned example-by-example.
  order = torch.randperm(len(train_dataset_qa)).tolist()
  dataloader_qa = DataLoader(dataset=torch.utils.data.Subset(train_dataset_qa, order),
                             batch_size=batch_size)
  dataloader_aq = DataLoader(dataset=torch.utils.data.Subset(train_dataset_aq, order),
                             batch_size=batch_size)

  # iterate the QA and the AQ inputs simultaneously
  for step, (batch_1, batch_2) in enumerate(zip(dataloader_qa, dataloader_aq)):
    # move input to the selected device
    batch_1 = tuple(t.to(device) for t in batch_1)
    batch_2 = tuple(t.to(device) for t in batch_2)
    inputs = {
        "input_ids": [batch_1[0], batch_2[0]],
        "attention_mask": [batch_1[1], batch_2[1]],
        # both views share the same labels, so the QA batch's labels are used
        "next_sentence_label": batch_1[2]
    }
    outputs = model(**inputs)

    # with labels supplied the model returns (loss, logits)
    loss = outputs[0]
    # backpass
    loss.backward()
    # .item() prints the plain float instead of the tensor repr
    print(f"epoch:{epoch}, loss:{loss.item()}")

    # re-calculate the weights
    optimizer.step()
    # reset the grads to 0 before the next step
    model.zero_grad()

  print("\n")

tensor([[-0.1286,  0.2924],
        [-0.1297, -0.0054],
        [-0.0617,  0.1272],
        [-0.0929,  0.2750],
        [-0.3805,  0.1876]], device='cuda:0', grad_fn=<AddmmBackward>)
epoch:0, loss:0.543137788772583
tensor([[-1.8321,  1.4266],
        [-1.4045,  1.4116],
        [-1.6396,  1.6787],
        [-1.7913,  1.5584],
        [-1.6969,  1.6180]], device='cuda:0', grad_fn=<AddmmBackward>)
epoch:0, loss:3.251826524734497


tensor([[-1.3782,  1.3265],
        [-1.4314,  1.3607],
        [-1.1658,  1.1141],
        [-1.5283,  1.1850],
        [-1.0746,  0.9862]], device='cuda:0', grad_fn=<AddmmBackward>)
epoch:1, loss:0.08113940805196762
tensor([[-0.9390,  0.7318],
        [-0.9579,  0.8381],
        [-1.0769,  0.7628],
        [-0.7682,  0.7492],
        [-0.7741,  0.7154]], device='cuda:0', grad_fn=<AddmmBackward>)
epoch:1, loss:1.837673544883728


tensor([[-0.1964,  0.3336],
        [-0.1507,  0.2372],
        [-0.4361, -0.0579],
        [-0.3163,  0.1649],
        [ 0.0985,  0.1

# Test

In [None]:
# Training and testing on the same dataset, just for illustration. Never do this in practice.

# standard pytorch way of doing things
# 1. create a custom Dataset
# 2. pass the dataset to a dataloader
# 3. iterate the dataloader and pass the inputs to the model

input_ids_qa, attention_masks_qa, labels_qa = prepare_data(dataset)
test_dataset_qa = QADataset(input_ids_qa, attention_masks_qa, labels_qa)

input_ids_aq, attention_masks_aq, labels_aq = prepare_data(dataset, qa=False)
test_dataset_aq = QADataset(input_ids_aq, attention_masks_aq, labels_aq)

# Sequential order keeps the QA and AQ batches aligned example-by-example.
dataloader_qa =  DataLoader(dataset=test_dataset_qa,
                            batch_size=16,
                            sampler=SequentialSampler(test_dataset_qa))
dataloader_aq =  DataLoader(dataset=test_dataset_aq,
                            batch_size=16,
                            sampler=SequentialSampler(test_dataset_aq))

complete_outputs, complete_label_ids = [], []

# only forward pass so no dropout (set once; it does not change per batch)
model.eval()

# iterate the QA and the AQ inputs simultaneously
for step, (batch_1, batch_2) in enumerate(zip(dataloader_qa, dataloader_aq)):
  # move input to the selected device
  batch_1 = tuple(t.to(device) for t in batch_1)
  batch_2 = tuple(t.to(device) for t in batch_2)

  inputs = {
      "input_ids": [batch_1[0], batch_2[0]],
      "attention_mask": [batch_1[1], batch_2[1]],
      "next_sentence_label": batch_1[2]
  }

  # no back pass so no need to track variables for differentiation
  with torch.no_grad():
    tmp_eval_loss, logits = model(**inputs)[:2]

  # the predicted class is the index of the larger logit; .tolist() yields
  # plain Python ints rather than numpy scalars
  predictions = np.argmax(logits.cpu().numpy(), axis=1).tolist()
  label_ids = inputs["next_sentence_label"].cpu().numpy().tolist()

  complete_outputs.extend(predictions)
  complete_label_ids.extend(label_ids)

print(complete_outputs, complete_label_ids)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
