# Ensemble Fine-tuned Models (BERT, ALBERT, RoBERTa) on HANS Test

### Load Libraries, Check Hardware

In [1]:
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset, TensorDataset, DataLoader, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pad_sequence

In [2]:
!nvidia-smi

Sat Dec 16 07:32:48 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    27W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


### Load Full Model (393K) checkpoints

In [4]:
from transformers import BertForSequenceClassification, AlbertForSequenceClassification, RobertaForSequenceClassification

bert = BertForSequenceClassification.from_pretrained('/home/allen/other/BERT_Full', num_labels=3)
bert.to(device)
albert = AlbertForSequenceClassification.from_pretrained("/home/allen/other/ALBERT_Full", num_labels=3)
albert.to(device)
roberta = RobertaForSequenceClassification.from_pretrained("/home/allen/other/RoBERTa_Full", num_labels=3)
roberta.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

### Load HANS Dataset and Define Dataloaders

In [5]:
from nlp import load_dataset
hans_dataset = load_dataset("hans")

2023-12-16 07:33:18.068474: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
from transformers import BertTokenizer, AlbertTokenizer, RobertaTokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

In [7]:
def bert_tokenize_function(examples):
    return bert_tokenizer(examples["premise"], examples["hypothesis"], truncation=True, padding=True)

def albert_tokenize_function(examples):
    return albert_tokenizer(examples["premise"], examples["hypothesis"], truncation=True, padding=True)

def roberta_tokenize_function(examples):
    return roberta_tokenizer(examples["premise"], examples["hypothesis"], truncation=True, padding=True)

bert_tokenized_hans = hans_dataset.map(bert_tokenize_function, batched=True)
albert_tokenized_hans = hans_dataset.map(albert_tokenize_function, batched=True)
roberta_tokenized_hans = hans_dataset.map(roberta_tokenize_function, batched=True)

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

In [8]:
from torch.utils.data import DataLoader
bert_tokenized_hans.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
albert_tokenized_hans.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
roberta_tokenized_hans.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [9]:
from torch.nn.utils.rnn import pad_sequence

def bert_collate_batch(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    labels = [item['label'] for item in batch]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=bert_tokenizer.pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = torch.tensor(labels)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

def albert_collate_batch(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    labels = [item['label'] for item in batch]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=albert_tokenizer.pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = torch.tensor(labels)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }


def roberta_collate_batch(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    labels = [item['label'] for item in batch]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=roberta_tokenizer.pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = torch.tensor(labels)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }


In [10]:
bert_hans_loader = DataLoader(bert_tokenized_hans['validation'], batch_size=32, collate_fn=bert_collate_batch)
albert_hans_loader = DataLoader(bert_tokenized_hans['validation'], batch_size=32, collate_fn=albert_collate_batch)
roberta_hans_loader = DataLoader(bert_tokenized_hans['validation'], batch_size=32, collate_fn=roberta_collate_batch)

### Test Single Models

In [15]:
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from tqdm import tqdm

def evaluate_model(model, data_loader):
    model.eval() 
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)

            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    df = pd.DataFrame({'Actual': all_labels, 'Predicted': all_predictions})

    overall_accuracy = accuracy_score(df['Actual'], df['Predicted'])
    print(f"Overall Accuracy: {overall_accuracy:.4f}")

    confusion_matrix = pd.crosstab(df['Actual'], df['Predicted'], rownames=['Actual'], colnames=['Predicted'], margins=True)
    print("Confusion Matrix:\n", confusion_matrix)

    class_accuracies = df.groupby('Actual').apply(lambda x: accuracy_score(x['Actual'], x['Predicted']))
    print("Accuracy per class:\n", class_accuracies)

    return overall_accuracy

In [16]:
evaluate_model(bert, bert_hans_loader)

100%|██████████| 938/938 [00:52<00:00, 17.82it/s]


Overall Accuracy: 0.5616
Confusion Matrix:
 Predicted      0      1    2    All
Actual                             
0           9901   5014   85  15000
1           7975   6948   77  15000
All        17876  11962  162  30000
Accuracy per class:
 Actual
0    0.660067
1    0.463200
dtype: float64


0.5616333333333333

In [17]:
evaluate_model(albert, albert_hans_loader)

100%|██████████| 938/938 [00:54<00:00, 17.17it/s]


Overall Accuracy: 0.4983
Confusion Matrix:
 Predicted      0    1  2    All
Actual                         
0          14846  153  1  15000
1          14893  103  4  15000
All        29739  256  5  30000
Accuracy per class:
 Actual
0    0.989733
1    0.006867
dtype: float64


0.4983

In [18]:
evaluate_model(roberta, roberta_hans_loader)

100%|██████████| 938/938 [00:52<00:00, 17.95it/s]


Overall Accuracy: 0.4292
Confusion Matrix:
 Predicted      0     1     2    All
Actual                             
0          11887   561  2552  15000
1          12219   990  1791  15000
All        24106  1551  4343  30000
Accuracy per class:
 Actual
0    0.792467
1    0.066000
dtype: float64


0.42923333333333336

### Ensemble 2 Models

In [58]:
import pandas as pd
import torch
from sklearn.metrics import accuracy_score

def evaluate_ensemble(model1, model2, model1_loader, model2_loader, ensemble_weights):
    model1.eval() 
    model2.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for a_batch, b_batch in tqdm(zip(model1_loader, model2_loader), total=len(model1_loader)):
            a_input_ids = a_batch['input_ids'].to(device)
            a_attention_mask = a_batch['attention_mask'].to(device)
            a_labels = a_batch['labels'].to(device)

            a_outputs = model1(input_ids=a_input_ids, attention_mask=a_attention_mask)
            a_logits = a_outputs.logits
            a_probs = torch.softmax(a_logits, dim=1)

            b_input_ids = b_batch['input_ids'].to(device)
            b_attention_mask = b_batch['attention_mask'].to(device)
            b_labels = b_batch['labels'].to(device)

            b_outputs = model2(input_ids=b_input_ids, attention_mask=b_attention_mask)
            b_logits = b_outputs.logits
            b_probs = torch.softmax(b_logits, dim=1)

            ensemble_probs = ensemble_weights[0] * a_probs + ensemble_weights[1] * b_probs
            ensemble_preds = torch.argmax(ensemble_probs, dim=1)
            all_predictions.extend(ensemble_preds.cpu().numpy())
            all_labels.extend(b_labels.cpu().numpy())

    df = pd.DataFrame({'Actual': all_labels, 'Predicted': all_predictions})

    overall_accuracy = accuracy_score(df['Actual'], df['Predicted'])
    print(f"Overall Accuracy: {overall_accuracy:.4f}")

    # confusion_matrix = pd.crosstab(df['Actual'], df['Predicted'], rownames=['Actual'], colnames=['Predicted'], margins=True)
    # print("Confusion Matrix:\n", confusion_matrix)

    class_accuracies = df.groupby('Actual').apply(lambda x: accuracy_score(x['Actual'], x['Predicted']))
    print("Accuracy per class:\n", class_accuracies)

    return overall_accuracy

### Grid Search for Best Weights (BERT + ALBERT)

In [60]:
step = 0.1
best_score = 0.0
for bert_weight in np.arange(0.1, 0.9 + step, step):
    roberta_weight = 1 - bert_weight
    print(f"Weights = BERT {bert_weight}, ALBERT {roberta_weight}")
    score = evaluate_ensemble(bert, albert, bert_hans_loader, albert_hans_loader, ensemble_weights=[bert_weight, roberta_weight])
    if score > best_score:
        best_score = score
        best_weights = [bert_weight, roberta_weight]

print(best_weights)

Weights = BERT 0.1, ALBERT 0.9


100%|██████████| 938/938 [01:44<00:00,  8.97it/s]


Overall Accuracy: 0.4986
Accuracy per class:
 Actual
0    0.990533
1    0.006667
dtype: float64
Weights = BERT 0.2, ALBERT 0.8


100%|██████████| 938/938 [01:43<00:00,  9.02it/s]


Overall Accuracy: 0.4992
Accuracy per class:
 Actual
0    0.9914
1    0.0070
dtype: float64
Weights = BERT 0.30000000000000004, ALBERT 0.7


100%|██████████| 938/938 [01:44<00:00,  9.00it/s]


Overall Accuracy: 0.5018
Accuracy per class:
 Actual
0    0.979733
1    0.023933
dtype: float64
Weights = BERT 0.4, ALBERT 0.6


100%|██████████| 938/938 [01:43<00:00,  9.04it/s]


Overall Accuracy: 0.5329
Accuracy per class:
 Actual
0    0.908533
1    0.157267
dtype: float64
Weights = BERT 0.5, ALBERT 0.5


100%|██████████| 938/938 [01:44<00:00,  9.02it/s]


Overall Accuracy: 0.5514
Accuracy per class:
 Actual
0    0.838733
1    0.264000
dtype: float64
Weights = BERT 0.6, ALBERT 0.4


100%|██████████| 938/938 [01:44<00:00,  9.01it/s]


Overall Accuracy: 0.5593
Accuracy per class:
 Actual
0    0.785533
1    0.333000
dtype: float64
Weights = BERT 0.7000000000000001, ALBERT 0.29999999999999993


100%|██████████| 938/938 [01:44<00:00,  8.99it/s]


Overall Accuracy: 0.5626
Accuracy per class:
 Actual
0    0.743733
1    0.381467
dtype: float64
Weights = BERT 0.8, ALBERT 0.19999999999999996


100%|██████████| 938/938 [01:45<00:00,  8.89it/s]


Overall Accuracy: 0.5620
Accuracy per class:
 Actual
0    0.708933
1    0.415067
dtype: float64
Weights = BERT 0.9, ALBERT 0.09999999999999998


100%|██████████| 938/938 [01:44<00:00,  8.99it/s]


Overall Accuracy: 0.5616
Accuracy per class:
 Actual
0    0.680867
1    0.442267
dtype: float64
[0.7000000000000001, 0.29999999999999993]


### Grid Search for Best Weights (BERT + RoBERTa)

In [61]:
step = 0.1
best_score = 0.0
for bert_weight in np.arange(0.1, 0.9 + step, step):
    roberta_weight = 1 - bert_weight
    print(f"Weights = BERT {bert_weight}, ROBERTA {roberta_weight}")
    score = evaluate_ensemble(bert, roberta, bert_hans_loader, roberta_hans_loader, ensemble_weights=[bert_weight, roberta_weight])
    if score > best_score:
        best_score = score
        best_weights = [bert_weight, roberta_weight]

print(best_weights)

Weights = BERT 0.1, ROBERTA 0.9


100%|██████████| 938/938 [01:42<00:00,  9.17it/s]


Overall Accuracy: 0.4390
Accuracy per class:
 Actual
0    0.810667
1    0.067400
dtype: float64
Weights = BERT 0.2, ROBERTA 0.8


100%|██████████| 938/938 [01:40<00:00,  9.33it/s]


Overall Accuracy: 0.4501
Accuracy per class:
 Actual
0    0.828067
1    0.072200
dtype: float64
Weights = BERT 0.30000000000000004, ROBERTA 0.7


100%|██████████| 938/938 [01:41<00:00,  9.26it/s]


Overall Accuracy: 0.4654
Accuracy per class:
 Actual
0    0.848800
1    0.082067
dtype: float64
Weights = BERT 0.4, ROBERTA 0.6


100%|██████████| 938/938 [01:41<00:00,  9.28it/s]


Overall Accuracy: 0.4883
Accuracy per class:
 Actual
0    0.868667
1    0.107867
dtype: float64
Weights = BERT 0.5, ROBERTA 0.5


100%|██████████| 938/938 [01:40<00:00,  9.30it/s]


Overall Accuracy: 0.5166
Accuracy per class:
 Actual
0    0.862533
1    0.170667
dtype: float64
Weights = BERT 0.6, ROBERTA 0.4


100%|██████████| 938/938 [01:41<00:00,  9.22it/s]


Overall Accuracy: 0.5442
Accuracy per class:
 Actual
0    0.8238
1    0.2646
dtype: float64
Weights = BERT 0.7000000000000001, ROBERTA 0.29999999999999993


100%|██████████| 938/938 [01:41<00:00,  9.23it/s]


Overall Accuracy: 0.5565
Accuracy per class:
 Actual
0    0.774533
1    0.338400
dtype: float64
Weights = BERT 0.8, ROBERTA 0.19999999999999996


100%|██████████| 938/938 [01:41<00:00,  9.25it/s]


Overall Accuracy: 0.5615
Accuracy per class:
 Actual
0    0.731533
1    0.391533
dtype: float64
Weights = BERT 0.9, ROBERTA 0.09999999999999998


100%|██████████| 938/938 [01:41<00:00,  9.25it/s]


Overall Accuracy: 0.5619
Accuracy per class:
 Actual
0    0.691933
1    0.431800
dtype: float64
[0.9, 0.09999999999999998]


### Ensemble BERT + ALBERT + RoBERTa 

In [62]:
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader

def evaluate_ensemble_3(model1, model2, model3, model1_loader, model2_loader, model3_loader, ensemble_weights):
    model1.eval()  # Set the model to evaluation mode
    model2.eval()
    model3.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for a_batch, b_batch, c_batch in tqdm(zip(model1_loader, model2_loader, model3_loader), total=len(model1_loader)):
            a_input_ids = a_batch['input_ids'].to(device)
            a_attention_mask = a_batch['attention_mask'].to(device)
            a_labels = a_batch['labels'].to(device)

            a_outputs = model1(input_ids=a_input_ids, attention_mask=a_attention_mask)
            a_logits = a_outputs.logits
            a_probs = torch.softmax(a_logits, dim=1)

            b_input_ids = b_batch['input_ids'].to(device)
            b_attention_mask = b_batch['attention_mask'].to(device)
            b_labels = b_batch['labels'].to(device)

            b_outputs = model2(input_ids=b_input_ids, attention_mask=b_attention_mask)
            b_logits = b_outputs.logits
            b_probs = torch.softmax(b_logits, dim=1)

            c_input_ids = c_batch['input_ids'].to(device)
            c_attention_mask = c_batch['attention_mask'].to(device)
            c_labels = c_batch['labels'].to(device)

            c_outputs = model3(input_ids=c_input_ids, attention_mask=c_attention_mask)
            c_logits = c_outputs.logits
            c_probs = torch.softmax(c_logits, dim=1)

            ensemble_probs = ensemble_weights[0] * a_probs + ensemble_weights[1] * b_probs + ensemble_weights[2] * c_probs
            ensemble_preds = torch.argmax(ensemble_probs, dim=1)
            all_predictions.extend(ensemble_preds.cpu().numpy())
            all_labels.extend(c_labels.cpu().numpy())

    df = pd.DataFrame({'Actual': all_labels, 'Predicted': all_predictions})

    overall_accuracy = accuracy_score(df['Actual'], df['Predicted'])
    print(f"Overall Accuracy: {overall_accuracy:.4f}")

    # confusion_matrix = pd.crosstab(df['Actual'], df['Predicted'], rownames=['Actual'], colnames=['Predicted'], margins=True)
    # print("Confusion Matrix:\n", confusion_matrix)

    class_accuracies = df.groupby('Actual').apply(lambda x: accuracy_score(x['Actual'], x['Predicted']))
    print("Accuracy per class:\n", class_accuracies)

    return overall_accuracy

In [63]:
evaluate_ensemble_3(bert, albert, roberta, bert_hans_loader, albert_hans_loader, roberta_hans_loader, ensemble_weights=[0.33, 0.33, 0.33])

100%|██████████| 938/938 [02:33<00:00,  6.12it/s]


Overall Accuracy: 0.5088
Accuracy per class:
 Actual
0    0.957600
1    0.059933
dtype: float64


0.5087666666666667