# Ensemble Models (ALBERT, BERT, RoBERTa) on MultiNLI Test

### Load Libraries and Check Hardware

In [1]:
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset, TensorDataset, DataLoader, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pad_sequence

In [2]:
!nvidia-smi

Sat Dec 16 01:02:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    28W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


### Load MultiNLI Dataset

In [4]:
from nlp import load_dataset
train_data = load_dataset('multi_nli', split='train')
val_data = load_dataset('multi_nli', split='validation_matched')
test_data = load_dataset('multi_nli', split='validation_mismatched')

2023-12-16 01:02:46.894443: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# Convert to DataFrame
train_dataset = [item for item in train_data]
val_dataset = [item for item in val_data]
test_dataset = [item for item in test_data]
train_df = pd.DataFrame(train_dataset)
val_df = pd.DataFrame(val_dataset)
test_df = pd.DataFrame(test_dataset)

# Load in negation labels
group_labels = pd.read_csv('metadata_preset.csv')

In [6]:
# Organize dataframe

train_group_labels = group_labels[group_labels['split'] == 0]['sentence2_has_negation']
val_group_labels = group_labels[group_labels['split'] == 1]['sentence2_has_negation'].reset_index()['sentence2_has_negation']
test_group_labels  = group_labels[group_labels['split'] == 2]['sentence2_has_negation'].reset_index()['sentence2_has_negation']

In [7]:
train_df = train_df.merge(train_group_labels, left_index=True, right_index=True)
val_df = val_df.merge(val_group_labels, left_index=True, right_index=True)
test_df = test_df.merge(test_group_labels, left_index=True, right_index=True)

In [8]:
train_df = train_df.rename(columns={"premise": "sentence1", "hypothesis": "sentence2", "label": "gold_label", "sentence2_has_negation":"negation"})
val_df = val_df.rename(columns={"premise": "sentence1", "hypothesis": "sentence2", "label": "gold_label", "sentence2_has_negation":"negation"})
test_df = test_df.rename(columns={"premise": "sentence1", "hypothesis": "sentence2", "label": "gold_label", "sentence2_has_negation":"negation"})

label_map = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
train_df['gold_label'] = train_df['gold_label'].map(label_map)
val_df['gold_label'] = val_df['gold_label'].map(label_map)
test_df['gold_label'] = test_df['gold_label'].map(label_map)

In [9]:
val_df['class'] = val_df['gold_label'].astype(str) + "_" + val_df['negation'].astype(str)
train_df['class'] = train_df['gold_label'].astype(str) + "_" + train_df['negation'].astype(str)
test_df['class'] = test_df['gold_label'].astype(str) + "_" + test_df['negation'].astype(str)

In [10]:
train_df['class'].value_counts()

class
entailment_0       128042
neutral_0          127135
contradiction_0    109504
contradiction_1     21399
neutral_1            3765
entailment_1         2857
Name: count, dtype: int64

In [11]:
val_df['class'].value_counts()

class
entailment_0       3388
neutral_0          3036
contradiction_0    2709
contradiction_1     504
entailment_1         91
neutral_1            87
Name: count, dtype: int64

In [12]:
test_df['class'].value_counts()

class
entailment_0       3391
neutral_0          3044
contradiction_0    2696
contradiction_1     544
neutral_1            85
entailment_1         72
Name: count, dtype: int64

In [13]:
train_df = train_df.dropna()
val_df = val_df.dropna()
test_df = test_df.dropna()

In [14]:
train_df['sentence1'] = train_df['sentence1'].astype(str)
train_df['sentence2'] = train_df['sentence2'].astype(str)

In [15]:
val_df['sentence1'] = val_df['sentence1'].astype(str)
val_df['sentence2'] = val_df['sentence2'].astype(str)

In [16]:
test_df['sentence1'] = test_df['sentence1'].astype(str)
test_df['sentence2'] = test_df['sentence2'].astype(str)

In [17]:
train_df = train_df[(train_df['sentence1'].str.split().str.len() > 0) & (train_df['sentence2'].str.split().str.len() > 0)]
val_df = val_df[(val_df['sentence1'].str.split().str.len() > 0) & (val_df['sentence2'].str.split().str.len() > 0)]
test_df = test_df[(test_df['sentence1'].str.split().str.len() > 0) & (test_df['sentence2'].str.split().str.len() > 0)]

In [18]:
train_df

Unnamed: 0,sentence2,gold_label,sentence1,negation,class
0,Product and geography are what make cream skim...,neutral,Conceptually cream skimming has two basic dime...,0,neutral_0
1,You lose the things to the following level if ...,entailment,you know during the season and i guess at at y...,0,entailment_0
2,A member of my team will execute your orders w...,entailment,One of our number will carry out your instruct...,0,entailment_0
3,This information belongs to them.,entailment,How do you know? All this is their information...,0,entailment_0
4,The tennis shoes have a range of prices.,neutral,yeah i tell you what though if you go price so...,0,neutral_0
...,...,...,...,...,...
392697,California cannot do any better.,contradiction,"Clearly, California can - and must - do better.",0,contradiction_0
392698,So many of the original buildings had been rep...,neutral,It was once regarded as the most beautiful str...,0,neutral_0
392699,The tradition of houseboats originated while t...,entailment,Houseboats are a beautifully preserved traditi...,0,entailment_0
392700,The obituaries were beautiful and written in k...,neutral,Obituaries fondly recalled his on-air debates ...,0,neutral_0


### Define Dataset Classes for Each Model

In [19]:
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, AlbertTokenizer

class MNLIDataBert(Dataset):

  def __init__(self, train_df, val_df, test_df):
    self.label_dict = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
    
    # Add in negation classes (spurious feature)
    self.negation_dict = {'entailment_0': 0, 'entailment_1': 1, 'contradiction_0': 2, 'contradiction_1': 3, 'neutral_0': 4, 'neutral_1': 5}

    self.train_df = train_df
    self.val_df = val_df
    self.test_df = test_df

    self.base_path = '/content/'
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    self.train_data = None
    self.val_data = None
    self.test_data = None
    self.init_data()

  def init_data(self):
    self.train_data = self.load_data(self.train_df)
    self.val_data = self.load_data(self.val_df)
    self.test_data = self.load_data(self.test_df)

  def load_data(self, df):
    MAX_LEN = 512
    token_ids = []
    mask_ids = []
    seg_ids = []
    y = []
    n = []

    premise_list = df['sentence1'].to_list()
    hypothesis_list = df['sentence2'].to_list()
    label_list = df['gold_label'].to_list()
    negation_list = df['class'].to_list()

    for (premise, hypothesis, label, negation) in zip(premise_list, hypothesis_list, label_list, negation_list):
      premise_id = self.tokenizer.encode(premise, add_special_tokens = False)
      hypothesis_id = self.tokenizer.encode(hypothesis, add_special_tokens = False)
      pair_token_ids = [self.tokenizer.cls_token_id] + premise_id + [self.tokenizer.sep_token_id] + hypothesis_id + [self.tokenizer.sep_token_id]
      premise_len = len(premise_id)
      hypothesis_len = len(hypothesis_id)

      segment_ids = torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1)) 
      attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 3)) 

      token_ids.append(torch.tensor(pair_token_ids))
      seg_ids.append(segment_ids)
      mask_ids.append(attention_mask_ids)
      y.append(self.label_dict[label])
      n.append(self.negation_dict[negation])
    
    token_ids = pad_sequence(token_ids, batch_first=True)
    mask_ids = pad_sequence(mask_ids, batch_first=True)
    seg_ids = pad_sequence(seg_ids, batch_first=True)
    y = torch.tensor(y)
    n = torch.tensor(n)
    dataset = TensorDataset(token_ids, mask_ids, seg_ids, y, n)
    print(len(dataset))
    return dataset

  def get_data_loaders(self, batch_size=16, shuffle=False):
    train_loader = DataLoader(
      self.train_data,
      shuffle=shuffle,
      batch_size=batch_size,
      sampler=SequentialSampler(self.train_data)
    )

    val_loader = DataLoader(
      self.val_data,
      shuffle=shuffle,
      batch_size=batch_size,
      sampler=SequentialSampler(self.val_data)
    )

    test_loader = DataLoader(
      self.test_data,
      shuffle=shuffle,
      batch_size=batch_size,
      sampler=SequentialSampler(self.test_data)
    )

    return train_loader, val_loader, test_loader

In [20]:
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, AlbertTokenizer

class MNLIDataAlbert(Dataset):

  def __init__(self, train_df, val_df, test_df):
    self.label_dict = {'entailment': 0, 'contradiction': 1, 'neutral': 2}

    # Add in negation classes (spurious feature)
    self.negation_dict = {'entailment_0': 0, 'entailment_1': 1, 'contradiction_0': 2, 'contradiction_1': 3, 'neutral_0': 4, 'neutral_1': 5}

    self.train_df = train_df
    self.val_df = val_df
    self.test_df = test_df

    self.base_path = '/content/'
    self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)
    self.train_data = None
    self.val_data = None
    self.test_data = None
    self.init_data()

  def init_data(self):
    self.train_data = self.load_data(self.train_df)
    self.val_data = self.load_data(self.val_df)
    self.test_data = self.load_data(self.test_df)

  def load_data(self, df):
    MAX_LEN = 512
    token_ids = []
    mask_ids = []
    seg_ids = []
    y = []
    n = []

    premise_list = df['sentence1'].to_list()
    hypothesis_list = df['sentence2'].to_list()
    label_list = df['gold_label'].to_list()
    negation_list = df['class'].to_list()

    for (premise, hypothesis, label, negation) in zip(premise_list, hypothesis_list, label_list, negation_list):
      premise_id = self.tokenizer.encode(premise, add_special_tokens = False)
      hypothesis_id = self.tokenizer.encode(hypothesis, add_special_tokens = False)
      pair_token_ids = [self.tokenizer.cls_token_id] + premise_id + [self.tokenizer.sep_token_id] + hypothesis_id + [self.tokenizer.sep_token_id]
      premise_len = len(premise_id)
      hypothesis_len = len(hypothesis_id)

      segment_ids = torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1)) 
      attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 3)) 

      token_ids.append(torch.tensor(pair_token_ids))
      seg_ids.append(segment_ids)
      mask_ids.append(attention_mask_ids)
      y.append(self.label_dict[label])
      n.append(self.negation_dict[negation])
    
    token_ids = pad_sequence(token_ids, batch_first=True)
    mask_ids = pad_sequence(mask_ids, batch_first=True)
    seg_ids = pad_sequence(seg_ids, batch_first=True)
    y = torch.tensor(y)
    n = torch.tensor(n)
    dataset = TensorDataset(token_ids, mask_ids, seg_ids, y, n)
    print(len(dataset))
    return dataset

  def get_data_loaders(self, batch_size=16, shuffle=False):
    train_loader = DataLoader(
      self.train_data,
      shuffle=shuffle,
      batch_size=batch_size,
      sampler=SequentialSampler(self.train_data)
    )

    val_loader = DataLoader(
      self.val_data,
      shuffle=shuffle,
      batch_size=batch_size,
      sampler=SequentialSampler(self.val_data)
    )

    test_loader = DataLoader(
      self.test_data,
      shuffle=shuffle,
      batch_size=batch_size,
      sampler=SequentialSampler(self.test_data)
    )

    return train_loader, val_loader, test_loader

In [43]:
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, AlbertTokenizer, RobertaTokenizer

class MNLIDataRoberta(Dataset):

  def __init__(self, train_df, val_df, test_df):
    self.label_dict = {'entailment': 0, 'contradiction': 1, 'neutral': 2}

    # Add in negation classes (spurious feature)
    self.negation_dict = {'entailment_0': 0, 'entailment_1': 1, 'contradiction_0': 2, 'contradiction_1': 3, 'neutral_0': 4, 'neutral_1': 5}

    self.train_df = train_df
    self.val_df = val_df
    self.test_df = test_df

    self.base_path = '/content/'
    self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    self.train_data = None
    self.val_data = None
    self.test_data = None
    self.init_data()

  def init_data(self):
    self.train_data = self.load_data(self.train_df)
    self.val_data = self.load_data(self.val_df)
    self.test_data = self.load_data(self.test_df)

  def load_data(self, df):
    MAX_LEN = 512
    token_ids = []
    mask_ids = []
    y = []
    n = []

    premise_list = df['sentence1'].to_list()
    hypothesis_list = df['sentence2'].to_list()
    label_list = df['gold_label'].to_list()
    negation_list = df['class'].to_list()

    for (premise, hypothesis, label, negation) in zip(premise_list, hypothesis_list, label_list, negation_list):
      premise_id = self.tokenizer.encode(premise, add_special_tokens = False)
      hypothesis_id = self.tokenizer.encode(hypothesis, add_special_tokens = False)
      pair_token_ids = [self.tokenizer.bos_token_id] + premise_id + [self.tokenizer.sep_token_id] + hypothesis_id + [self.tokenizer.eos_token_id]

      premise_len = len(premise_id)
      hypothesis_len = len(hypothesis_id)

      attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 3))  

      token_ids.append(torch.tensor(pair_token_ids))
      mask_ids.append(attention_mask_ids)
      y.append(self.label_dict[label])
      n.append(self.negation_dict[negation])
    
    token_ids = pad_sequence(token_ids, batch_first=True)
    mask_ids = pad_sequence(mask_ids, batch_first=True)
    y = torch.tensor(y)
    n = torch.tensor(n)
    dataset = TensorDataset(token_ids, mask_ids, y, n)
    print(len(dataset))
    return dataset

  def get_data_loaders(self, batch_size=32, shuffle=False):
    train_loader = DataLoader(
      self.train_data,
      shuffle=shuffle,
      batch_size=batch_size,
      sampler=SequentialSampler(self.train_data)
    )

    val_loader = DataLoader(
      self.val_data,
      shuffle=shuffle,
      batch_size=batch_size,
      sampler=SequentialSampler(self.val_data)
    )

    test_loader = DataLoader(
      self.test_data,
      shuffle=shuffle,
      batch_size=32,
      sampler=SequentialSampler(self.test_data)
    )

    return train_loader, val_loader, test_loader

### Get Dataloaders

In [44]:
# Only need validation and test sets, since we have trained model already
bert_mnli_dataset = MNLIDataBert(train_df[:1], val_df[:], test_df[:])
albert_mnli_dataset = MNLIDataAlbert(train_df[:1], val_df[:], test_df[:])
roberta_mnli_dataset = MNLIDataRoberta(train_df[:1], val_df[:], test_df[:])

1
9815
9832
1
9815
9832
1
9815
9832


In [45]:
_, bert_val_loader, bert_test_loader = bert_mnli_dataset.get_data_loaders(batch_size=8)
_, albert_val_loader, albert_test_loader = albert_mnli_dataset.get_data_loaders(batch_size=8)
_, roberta_val_loader, roberta_test_loader = roberta_mnli_dataset.get_data_loaders(batch_size=8)

### Load Full Model (393K) checkpoints

In [46]:
from transformers import BertForSequenceClassification, AlbertForSequenceClassification, RobertaForSequenceClassification

bert = BertForSequenceClassification.from_pretrained('/home/allen/other/BERT_Full', num_labels=3)
bert.to(device)
albert = AlbertForSequenceClassification.from_pretrained("/home/allen/other/ALBERT_Full", num_labels=3)
albert.to(device)
roberta = RobertaForSequenceClassification.from_pretrained("/home/allen/other/RoBERTa_Full", num_labels=3)
roberta.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

### Evaluate Individual Model Validation Accuracies

#### BERT

In [47]:
import time
from tqdm import tqdm
import seaborn as sns
from collections import defaultdict
import matplotlib.pyplot as plt

negation_dict = {0: 'entailment_0',
 1: 'entailment_1',
 2: 'contradiction_0',
 3: 'contradiction_1',
 4: 'neutral_0',
 5: 'neutral_1'}

def multi_acc(y_pred, y_test):
  acc = (torch.log_softmax(y_pred, dim=1).argmax(dim=1) == y_test).sum().float() / float(y_test.size(0))
  return acc

def evaluate_bert(bert, bert_val_loader):  
  bert.eval()
  total_val_acc  = 0
  total_val_loss = 0

  class_correct_predictions = defaultdict(int)
  class_total_predictions = defaultdict(int)

  with torch.no_grad():
    for batch_bert in tqdm(bert_val_loader, total=len(bert_val_loader)):
      # BERT
      b_pair_token_ids, b_mask_ids, b_seg_ids, b_y, b_n = batch_bert
      b_pair_token_ids = b_pair_token_ids.to(device)
      b_mask_ids = b_mask_ids.to(device)
      b_seg_ids = b_seg_ids.to(device)
      b_labels = b_y.to(device)
      b_negation = b_n.to(device)

      b_loss, b_logits = bert(b_pair_token_ids, 
                            token_type_ids=b_seg_ids, 
                            attention_mask=b_mask_ids, 
                            labels=b_labels).values()
      b_probs = torch.softmax(b_logits, dim=1)
      preds = torch.argmax(b_probs, dim=1)

      # Compute accuracy per negation class
      for i in range(len(b_labels)):
        class_identifier = negation_dict[int(b_negation[i].item())]
        class_correct_predictions[class_identifier] += int(preds[i] == b_labels[i])
        class_total_predictions[class_identifier] += 1

      correct_preds = (preds == b_labels.to(device)).sum().item() / float(b_labels.size(0))
      total_val_acc += correct_preds
      total_val_loss += (b_loss.item())

    val_acc  = total_val_acc/len(bert_val_loader)
    val_loss = total_val_loss/len(bert_val_loader)
  
    print(f'Test_loss: {val_loss:.4f} Test_acc: {val_acc:.4f}')

    for class_id, correct_count in sorted(class_correct_predictions.items()):
      accuracy = correct_count / class_total_predictions[class_id]
      print(f"--- Class {class_id} accuracy: {accuracy:.4f}]")
    print('\n')

In [48]:
evaluate_bert(bert, bert_test_loader)

100%|██████████| 1229/1229 [00:37<00:00, 32.35it/s]

Test_loss: 0.5649 Test_acc: 0.8315
--- Class contradiction_0 accuracy: 0.8179]
--- Class contradiction_1 accuracy: 0.9430]
--- Class entailment_0 accuracy: 0.8493]
--- Class entailment_1 accuracy: 0.7222]
--- Class neutral_0 accuracy: 0.8121]
--- Class neutral_1 accuracy: 0.6235]







In [36]:
evaluate_bert(bert, bert_val_loader)

100%|██████████| 1227/1227 [00:42<00:00, 29.05it/s]

Test_loss: 0.5864 Test_acc: 0.8329
--- Class contradiction_0 accuracy: 0.8357]
--- Class contradiction_1 accuracy: 0.9464]
--- Class entailment_0 accuracy: 0.8297]
--- Class entailment_1 accuracy: 0.7802]
--- Class neutral_0 accuracy: 0.8221]
--- Class neutral_1 accuracy: 0.6437]







#### ALBERT

In [27]:
import time
from tqdm import tqdm
import seaborn as sns
from collections import defaultdict
import matplotlib.pyplot as plt

negation_dict = {0: 'entailment_0',
 1: 'entailment_1',
 2: 'contradiction_0',
 3: 'contradiction_1',
 4: 'neutral_0',
 5: 'neutral_1'}

def multi_acc(y_pred, y_test):
  acc = (torch.log_softmax(y_pred, dim=1).argmax(dim=1) == y_test).sum().float() / float(y_test.size(0))
  return acc

def evaluate_albert(albert, albert_val_loader):  
  albert.eval()
  total_val_acc  = 0
  total_val_loss = 0

  class_correct_predictions = defaultdict(int)
  class_total_predictions = defaultdict(int)

  with torch.no_grad():
    for batch_albert in tqdm(albert_val_loader, total=len(albert_val_loader)):      
      # ALBERT
      a_pair_token_ids, a_mask_ids, a_seg_ids, a_y, a_n = batch_albert
      a_pair_token_ids = a_pair_token_ids.to(device)
      a_mask_ids = a_mask_ids.to(device)
      a_seg_ids = a_seg_ids.to(device)
      a_labels = a_y.to(device)
      a_negation = a_n.to(device)

      a_loss, a_logits = albert(a_pair_token_ids, 
                                token_type_ids=a_seg_ids, 
                                attention_mask=a_mask_ids, 
                                labels=a_labels).values()
      a_probs = torch.softmax(a_logits, dim=1)
      preds = torch.argmax(a_probs, dim=1)

      # Compute accuracy per negation class
      for i in range(len(a_labels)):
        class_identifier = negation_dict[int(a_negation[i].item())]
        class_correct_predictions[class_identifier] += int(preds[i] == a_labels[i])
        class_total_predictions[class_identifier] += 1

      correct_preds = (preds == a_labels.to(device)).sum().item() / float(a_labels.size(0))
      total_val_acc += correct_preds
      total_val_loss += a_loss.item()

    val_acc  = total_val_acc/len(albert_val_loader)
    val_loss = total_val_loss/len(albert_val_loader)
  
    print(f'Test_loss: {val_loss:.4f} Test_acc: {val_acc:.4f}')

    for class_id, correct_count in sorted(class_correct_predictions.items()):
      accuracy = correct_count / class_total_predictions[class_id]
      print(f"--- Class {class_id} accuracy: {accuracy:.4f}]")
    print('\n')

In [28]:
evaluate_albert(albert, albert_test_loader)

100%|██████████| 1229/1229 [00:44<00:00, 27.53it/s]

Test_loss: 0.4565 Test_acc: 0.8459
--- Class contradiction_0 accuracy: 0.8349]
--- Class contradiction_1 accuracy: 0.9688]
--- Class entailment_0 accuracy: 0.8596]
--- Class entailment_1 accuracy: 0.7917]
--- Class neutral_0 accuracy: 0.8249]
--- Class neutral_1 accuracy: 0.6588]







In [37]:
evaluate_albert(albert, albert_val_loader)

100%|██████████| 1227/1227 [00:50<00:00, 24.45it/s]

Test_loss: 0.4549 Test_acc: 0.8451
--- Class contradiction_0 accuracy: 0.8365]
--- Class contradiction_1 accuracy: 0.9464]
--- Class entailment_0 accuracy: 0.8403]
--- Class entailment_1 accuracy: 0.7582]
--- Class neutral_0 accuracy: 0.8475]
--- Class neutral_1 accuracy: 0.7241]







#### RoBERTa

In [29]:
import time
from tqdm import tqdm
import seaborn as sns
from collections import defaultdict
import matplotlib.pyplot as plt

negation_dict = {0: 'entailment_0',
 1: 'entailment_1',
 2: 'contradiction_0',
 3: 'contradiction_1',
 4: 'neutral_0',
 5: 'neutral_1'}

def multi_acc(y_pred, y_test):
  acc = (torch.log_softmax(y_pred, dim=1).argmax(dim=1) == y_test).sum().float() / float(y_test.size(0))
  return acc

def evaluate_roberta(roberta, roberta_val_loader):  
  roberta.eval()
  total_val_acc  = 0
  total_val_loss = 0

  class_correct_predictions = defaultdict(int)
  class_total_predictions = defaultdict(int)

  with torch.no_grad():
    for batch_roberta in tqdm(roberta_val_loader, total=len(roberta_val_loader)):
      r_pair_token_ids, r_mask_ids, r_y, r_n = batch_roberta
      r_pair_token_ids = r_pair_token_ids.to(device)
      r_mask_ids = r_mask_ids.to(device)
      r_labels = r_y.to(device)
      r_negation = r_n.to(device)

      r_loss, r_logits = roberta(r_pair_token_ids, 
                                attention_mask=r_mask_ids, 
                                labels=r_labels).values()
      r_probs = torch.softmax(r_logits, dim=1)
      
      preds = torch.argmax(r_probs, dim=1)

      # Compute accuracy per negation class
      for i in range(len(r_labels)):
        class_identifier = negation_dict[int(r_negation[i].item())]
        class_correct_predictions[class_identifier] += int(preds[i] == r_labels[i])
        class_total_predictions[class_identifier] += 1

      correct_preds = (preds == r_labels.to(device)).sum().item() / float(r_labels.size(0))
      total_val_acc += correct_preds
      total_val_loss += r_loss.item()

    val_acc  = total_val_acc/len(roberta_val_loader)
    val_loss = total_val_loss/len(roberta_val_loader)
  
    print(f'Test_loss: {val_loss:.4f} Test_acc: {val_acc:.4f}')

    for class_id, correct_count in sorted(class_correct_predictions.items()):
      accuracy = correct_count / class_total_predictions[class_id]
      print(f"--- Class {class_id} accuracy: {accuracy:.4f}]")
    print('\n')

In [30]:
evaluate_roberta(roberta, roberta_test_loader)

100%|██████████| 1229/1229 [00:39<00:00, 30.90it/s]

Test_loss: 0.3754 Test_acc: 0.8737
--- Class contradiction_0 accuracy: 0.8709]
--- Class contradiction_1 accuracy: 0.9798]
--- Class entailment_0 accuracy: 0.8924]
--- Class entailment_1 accuracy: 0.8194]
--- Class neutral_0 accuracy: 0.8407]
--- Class neutral_1 accuracy: 0.7647]







In [38]:
evaluate_roberta(roberta, roberta_val_loader)

100%|██████████| 1227/1227 [00:40<00:00, 30.03it/s]

Test_loss: 0.3723 Test_acc: 0.8754
--- Class contradiction_0 accuracy: 0.8834]
--- Class contradiction_1 accuracy: 0.9702]
--- Class entailment_0 accuracy: 0.8878]
--- Class entailment_1 accuracy: 0.8571]
--- Class neutral_0 accuracy: 0.8449]
--- Class neutral_1 accuracy: 0.6782]







### BERT + ALBERT Ensemble

In [57]:
import time
from tqdm import tqdm
import seaborn as sns
from collections import defaultdict
import matplotlib.pyplot as plt

negation_dict = {0: 'entailment_0',
 1: 'entailment_1',
 2: 'contradiction_0',
 3: 'contradiction_1',
 4: 'neutral_0',
 5: 'neutral_1'}

def multi_acc(y_pred, y_test):
  acc = (torch.log_softmax(y_pred, dim=1).argmax(dim=1) == y_test).sum().float() / float(y_test.size(0))
  return acc

def evaluate_bert_albert(bert, albert, bert_val_loader, albert_val_loader, ensemble_weights=[0.5,0.5]):  
  bert.eval()
  albert.eval()
  total_val_acc  = 0
  total_val_loss = 0

  class_correct_predictions = defaultdict(int)
  class_total_predictions = defaultdict(int)

  with torch.no_grad():
    for batch_bert, batch_albert in tqdm(zip(bert_val_loader, albert_val_loader), total=min(len(bert_val_loader), len(albert_val_loader))):
      # BERT
      b_pair_token_ids, b_mask_ids, b_seg_ids, b_y, b_n = batch_bert
      b_pair_token_ids = b_pair_token_ids.to(device)
      b_mask_ids = b_mask_ids.to(device)
      b_seg_ids = b_seg_ids.to(device)
      b_labels = b_y.to(device)
      b_negation = b_n.to(device)

      b_loss, b_logits = bert(b_pair_token_ids, 
                            token_type_ids=b_seg_ids, 
                            attention_mask=b_mask_ids, 
                            labels=b_labels).values()
      b_probs = torch.softmax(b_logits, dim=1)
      # b_preds = torch.argmax(b_probs, dim=1)
      
      # ALBERT
      a_pair_token_ids, a_mask_ids, a_seg_ids, a_y, a_n = batch_albert
      a_pair_token_ids = a_pair_token_ids.to(device)
      a_mask_ids = a_mask_ids.to(device)
      a_seg_ids = a_seg_ids.to(device)
      a_labels = a_y.to(device)
      a_negation = a_n.to(device)
      a_loss, a_logits = albert(a_pair_token_ids, 
                                token_type_ids=a_seg_ids, 
                                attention_mask=a_mask_ids, 
                                labels=a_labels).values()
      a_probs = torch.softmax(a_logits, dim=1)

      # Weighted Majority Voting Attempt
      # a_preds = torch.argmax(a_probs, dim=1)
      # ensemble_preds = torch.zeros_like(b_labels)
      # Compute weighted votes for each class and decide the final prediction
      # for i in range(b_labels.size(0)):
      #   votes = defaultdict(float)
      #   votes[b_preds[i].item()] += ensemble_weights[0]
      #   votes[a_preds[i].item()] += ensemble_weights[1]
      #   ensemble_preds[i] = max(votes, key=votes.get)

      # Apply weights to probabilities 
      ensemble_probs = a_probs * ensemble_weights[0] + b_probs * ensemble_weights[1]
      ensemble_preds = torch.argmax(ensemble_probs, dim=1)

      # Compute accuracy per negation class
      for i in range(len(a_labels)):
        label_name = 'entailment' if a_labels[i] == 0 else 'contradiction' if a_labels[i] == 1 else 'neutral'
        class_identifier = negation_dict[int(a_negation[i].item())]
        class_correct_predictions[class_identifier] += int(ensemble_preds[i] == a_labels[i])
        class_total_predictions[class_identifier] += 1

      correct_preds = (ensemble_preds == a_labels.to(device)).sum().item() / float(a_labels.size(0))
      total_val_acc += correct_preds
      total_val_loss += (b_loss.item() + a_loss.item())/2

    val_acc  = total_val_acc/len(bert_val_loader)
    val_loss = total_val_loss/len(bert_val_loader)
  
    print(f'Test_loss: {val_loss:.4f} Test_acc: {val_acc:.4f}')

    for class_id, correct_count in sorted(class_correct_predictions.items()):
      accuracy = correct_count / class_total_predictions[class_id]
      print(f"--- Class {class_id} accuracy: {accuracy:.4f}]")
    print('\n')

  return float(val_acc)

### BERT + RoBERTa Ensemble

In [65]:
import time
from tqdm import tqdm
import seaborn as sns
from collections import defaultdict
import matplotlib.pyplot as plt

negation_dict = {0: 'entailment_0',
 1: 'entailment_1',
 2: 'contradiction_0',
 3: 'contradiction_1',
 4: 'neutral_0',
 5: 'neutral_1'}

def multi_acc(y_pred, y_test):
  acc = (torch.log_softmax(y_pred, dim=1).argmax(dim=1) == y_test).sum().float() / float(y_test.size(0))
  return acc

def evaluate_bert_roberta(bert, roberta, bert_test_loader, roberta_test_loader, ensemble_weights=[0.5,0.5]):  
  bert.eval()
  roberta.eval()
  total_val_acc  = 0
  total_val_loss = 0

  class_correct_predictions = defaultdict(int)
  class_total_predictions = defaultdict(int)

  with torch.no_grad():
    for batch_bert, batch_roberta in tqdm(zip(bert_test_loader, roberta_test_loader), total=min(len(bert_test_loader), len(roberta_test_loader))):
      # BERT
      b_pair_token_ids, b_mask_ids, b_seg_ids, b_y, b_n = batch_bert
      b_pair_token_ids = b_pair_token_ids.to(device)
      b_mask_ids = b_mask_ids.to(device)
      b_seg_ids = b_seg_ids.to(device)
      b_labels = b_y.to(device)
      b_negation = b_n.to(device)

      b_loss, b_logits = bert(b_pair_token_ids, 
                            token_type_ids=b_seg_ids, 
                            attention_mask=b_mask_ids, 
                            labels=b_labels).values()
      b_probs = torch.softmax(b_logits, dim=1)
      
      # ALBERT
      r_pair_token_ids, r_mask_ids, r_y, r_n = batch_roberta
      r_pair_token_ids = r_pair_token_ids.to(device)
      r_mask_ids = r_mask_ids.to(device)
      r_labels = r_y.to(device)
      r_negation = r_n.to(device)

      r_loss, r_logits = roberta(r_pair_token_ids, 
                                attention_mask=r_mask_ids, 
                                labels=r_labels).values()
      r_probs = torch.softmax(r_logits, dim=1)
      
      ensemble_probs = ensemble_weights[0] * b_probs + ensemble_weights[1] * r_probs
      ensemble_preds = torch.argmax(ensemble_probs, dim=1)

      # Compute accuracy per negation class
      for i in range(len(b_labels)):
        label_name = 'entailment' if b_labels[i] == 0 else 'contradiction' if b_labels[i] == 1 else 'neutral'
        class_identifier = negation_dict[int(b_negation[i].item())]
        class_correct_predictions[class_identifier] += int(ensemble_preds[i] == b_labels[i])
        class_total_predictions[class_identifier] += 1

      correct_preds = (ensemble_preds == b_labels.to(device)).sum().item() / float(b_labels.size(0))
      total_val_acc += correct_preds
      total_val_loss += (b_loss.item() + b_loss.item())/2

    val_acc  = total_val_acc/len(bert_val_loader)
    val_loss = total_val_loss/len(bert_val_loader)
  
    print(f'Test_loss: {val_loss:.4f} Test_acc: {val_acc:.4f}')

    for class_id, correct_count in sorted(class_correct_predictions.items()):
      accuracy = correct_count / class_total_predictions[class_id]
      print(f"--- Class {class_id} accuracy: {accuracy:.4f}]")
    print('\n')
  
  return float(val_acc)

### BERT + ALBERT + RoBERTa Ensemble

In [69]:
import time
from tqdm import tqdm
import seaborn as sns
from collections import defaultdict
import matplotlib.pyplot as plt

negation_dict = {0: 'entailment_0',
 1: 'entailment_1',
 2: 'contradiction_0',
 3: 'contradiction_1',
 4: 'neutral_0',
 5: 'neutral_1'}

def multi_acc(y_pred, y_test):
  acc = (torch.log_softmax(y_pred, dim=1).argmax(dim=1) == y_test).sum().float() / float(y_test.size(0))
  return acc

def evaluate_bert_albert_roberta(bert, roberta, albert, bert_val_loader, roberta_val_loader, albert_val_loader, ensemble_weights=[0.5,0.5,0.5]):  
  bert.eval()
  roberta.eval()
  albert.eval()
  total_val_acc  = 0
  total_val_loss = 0

  class_correct_predictions = defaultdict(int)
  class_total_predictions = defaultdict(int)

  with torch.no_grad():
    for batch_bert, batch_roberta, batch_albert in tqdm(zip(bert_val_loader, roberta_val_loader, albert_val_loader), total=min(len(bert_val_loader), len(roberta_val_loader))):
      # BERT
      b_pair_token_ids, b_mask_ids, b_seg_ids, b_y, b_n = batch_bert
      b_pair_token_ids = b_pair_token_ids.to(device)
      b_mask_ids = b_mask_ids.to(device)
      b_seg_ids = b_seg_ids.to(device)
      b_labels = b_y.to(device)
      b_negation = b_n.to(device)

      b_loss, b_logits = bert(b_pair_token_ids, 
                            token_type_ids=b_seg_ids, 
                            attention_mask=b_mask_ids, 
                            labels=b_labels).values()
      b_probs = torch.softmax(b_logits, dim=1)
      
      # ALBERT
      r_pair_token_ids, r_mask_ids, r_y, r_n = batch_roberta
      r_pair_token_ids = r_pair_token_ids.to(device)
      r_mask_ids = r_mask_ids.to(device)
      r_labels = r_y.to(device)
      r_negation = r_n.to(device)

      r_loss, r_logits = roberta(r_pair_token_ids, 
                                attention_mask=r_mask_ids, 
                                labels=r_labels).values()
      r_probs = torch.softmax(r_logits, dim=1)


      # ALBERT
      a_pair_token_ids, a_mask_ids, a_seg_ids, a_y, a_n = batch_albert
      a_pair_token_ids = a_pair_token_ids.to(device)
      a_mask_ids = a_mask_ids.to(device)
      a_seg_ids = a_seg_ids.to(device)
      a_labels = a_y.to(device)
      a_negation = a_n.to(device)

      a_loss, a_logits = albert(a_pair_token_ids, 
                                token_type_ids=a_seg_ids, 
                                attention_mask=a_mask_ids, 
                                labels=a_labels).values()
      a_probs = torch.softmax(a_logits, dim=1)

      
      ensemble_probs = ensemble_weights[0] * b_probs + ensemble_weights[1] * a_probs + ensemble_weights[2] * r_probs
      ensemble_preds = torch.argmax(ensemble_probs, dim=1)

      # Compute accuracy per negation class
      for i in range(len(b_labels)):
        label_name = 'entailment' if b_labels[i] == 0 else 'contradiction' if b_labels[i] == 1 else 'neutral'
        class_identifier = negation_dict[int(b_negation[i].item())]
        class_correct_predictions[class_identifier] += int(ensemble_preds[i] == b_labels[i])
        class_total_predictions[class_identifier] += 1

      correct_preds = (ensemble_preds == b_labels.to(device)).sum().item() / float(b_labels.size(0))
      total_val_acc += correct_preds
      total_val_loss += (b_loss.item() + b_loss.item())/2

    val_acc  = total_val_acc/len(bert_val_loader)
    val_loss = total_val_loss/len(bert_val_loader)
  
    print(f'Test_loss: {val_loss:.4f} Test_acc: {val_acc:.4f}')

    for class_id, correct_count in sorted(class_correct_predictions.items()):
      accuracy = correct_count / class_total_predictions[class_id]
      print(f"--- Class {class_id} accuracy: {accuracy:.4f}]")
    print('\n')

  return float(val_acc)

### Grid Search for Best Weights

In [66]:
step = 0.1
best_score = 0.0
for bert_weight in np.arange(0.1, 0.9 + step, step):
    roberta_weight = 1 - bert_weight
    print(f"Weights = BERT {bert_weight}, ROBERTA {roberta_weight}")
    score = evaluate_bert_roberta(bert, roberta, bert_val_loader, roberta_val_loader, ensemble_weights=[bert_weight, roberta_weight])
    if score > best_score:
        best_score = score
        best_weights = [bert_weight, roberta_weight]

print(best_weights)

Weights = BERT 0.1, ROBERTA 0.9


100%|██████████| 1227/1227 [01:22<00:00, 14.91it/s]


Test_loss: 0.5864 Test_acc: 0.8774
--- Class contradiction_0 accuracy: 0.8859]
--- Class contradiction_1 accuracy: 0.9722]
--- Class entailment_0 accuracy: 0.8875]
--- Class entailment_1 accuracy: 0.8462]
--- Class neutral_0 accuracy: 0.8488]
--- Class neutral_1 accuracy: 0.7011]


Weights = BERT 0.2, ROBERTA 0.8


100%|██████████| 1227/1227 [01:22<00:00, 14.86it/s]


Test_loss: 0.5864 Test_acc: 0.8788
--- Class contradiction_0 accuracy: 0.8852]
--- Class contradiction_1 accuracy: 0.9722]
--- Class entailment_0 accuracy: 0.8864]
--- Class entailment_1 accuracy: 0.8462]
--- Class neutral_0 accuracy: 0.8547]
--- Class neutral_1 accuracy: 0.7241]


Weights = BERT 0.30000000000000004, ROBERTA 0.7


100%|██████████| 1227/1227 [01:22<00:00, 14.84it/s]


Test_loss: 0.5864 Test_acc: 0.8786
--- Class contradiction_0 accuracy: 0.8852]
--- Class contradiction_1 accuracy: 0.9722]
--- Class entailment_0 accuracy: 0.8837]
--- Class entailment_1 accuracy: 0.8462]
--- Class neutral_0 accuracy: 0.8574]
--- Class neutral_1 accuracy: 0.7126]


Weights = BERT 0.4, ROBERTA 0.6


100%|██████████| 1227/1227 [01:22<00:00, 14.85it/s]


Test_loss: 0.5864 Test_acc: 0.8786
--- Class contradiction_0 accuracy: 0.8867]
--- Class contradiction_1 accuracy: 0.9762]
--- Class entailment_0 accuracy: 0.8799]
--- Class entailment_1 accuracy: 0.8242]
--- Class neutral_0 accuracy: 0.8613]
--- Class neutral_1 accuracy: 0.6782]


Weights = BERT 0.5, ROBERTA 0.5


100%|██████████| 1227/1227 [01:22<00:00, 14.85it/s]


Test_loss: 0.5864 Test_acc: 0.8707
--- Class contradiction_0 accuracy: 0.8800]
--- Class contradiction_1 accuracy: 0.9702]
--- Class entailment_0 accuracy: 0.8660]
--- Class entailment_1 accuracy: 0.8132]
--- Class neutral_0 accuracy: 0.8587]
--- Class neutral_1 accuracy: 0.6667]


Weights = BERT 0.6, ROBERTA 0.4


100%|██████████| 1227/1227 [01:22<00:00, 14.84it/s]


Test_loss: 0.5864 Test_acc: 0.8570
--- Class contradiction_0 accuracy: 0.8597]
--- Class contradiction_1 accuracy: 0.9683]
--- Class entailment_0 accuracy: 0.8539]
--- Class entailment_1 accuracy: 0.8022]
--- Class neutral_0 accuracy: 0.8468]
--- Class neutral_1 accuracy: 0.6552]


Weights = BERT 0.7000000000000001, ROBERTA 0.29999999999999993


100%|██████████| 1227/1227 [01:22<00:00, 14.83it/s]


Test_loss: 0.5864 Test_acc: 0.8487
--- Class contradiction_0 accuracy: 0.8549]
--- Class contradiction_1 accuracy: 0.9583]
--- Class entailment_0 accuracy: 0.8442]
--- Class entailment_1 accuracy: 0.7912]
--- Class neutral_0 accuracy: 0.8376]
--- Class neutral_1 accuracy: 0.6437]


Weights = BERT 0.8, ROBERTA 0.19999999999999996


100%|██████████| 1227/1227 [01:22<00:00, 14.84it/s]


Test_loss: 0.5864 Test_acc: 0.8414
--- Class contradiction_0 accuracy: 0.8450]
--- Class contradiction_1 accuracy: 0.9563]
--- Class entailment_0 accuracy: 0.8377]
--- Class entailment_1 accuracy: 0.7802]
--- Class neutral_0 accuracy: 0.8307]
--- Class neutral_1 accuracy: 0.6437]


Weights = BERT 0.9, ROBERTA 0.09999999999999998


100%|██████████| 1227/1227 [01:22<00:00, 14.84it/s]

Test_loss: 0.5864 Test_acc: 0.8367
--- Class contradiction_0 accuracy: 0.8394]
--- Class contradiction_1 accuracy: 0.9484]
--- Class entailment_0 accuracy: 0.8344]
--- Class entailment_1 accuracy: 0.7802]
--- Class neutral_0 accuracy: 0.8254]
--- Class neutral_1 accuracy: 0.6437]


[0.2, 0.8]





In [58]:
step = 0.1
best_score = 0.0
for bert_weight in np.arange(0.1, 0.9 + step, step):
    albert_weight = 1 - bert_weight
    print(f"Weights = BERT {bert_weight}, ALBERT {albert_weight}")
    score = evaluate_bert_albert(bert, albert, bert_val_loader, albert_val_loader, ensemble_weights=[bert_weight, albert_weight])
    if score > best_score:
        best_score = score
        best_weights = [bert_weight, albert_weight]

print(best_weights)

Weights = BERT 0.1, ALBERT 0.9


100%|██████████| 1227/1227 [01:32<00:00, 13.34it/s]


Test_loss: 0.5206 Test_acc: 0.8367
--- Class contradiction_0 accuracy: 0.8372]
--- Class contradiction_1 accuracy: 0.9484]
--- Class entailment_0 accuracy: 0.8335]
--- Class entailment_1 accuracy: 0.7912]
--- Class neutral_0 accuracy: 0.8281]
--- Class neutral_1 accuracy: 0.6437]


Weights = BERT 0.2, ALBERT 0.8


100%|██████████| 1227/1227 [01:32<00:00, 13.32it/s]


Test_loss: 0.5206 Test_acc: 0.8399
--- Class contradiction_0 accuracy: 0.8431]
--- Class contradiction_1 accuracy: 0.9544]
--- Class entailment_0 accuracy: 0.8338]
--- Class entailment_1 accuracy: 0.8022]
--- Class neutral_0 accuracy: 0.8317]
--- Class neutral_1 accuracy: 0.6437]


Weights = BERT 0.30000000000000004, ALBERT 0.7


100%|██████████| 1227/1227 [01:32<00:00, 13.32it/s]


Test_loss: 0.5206 Test_acc: 0.8441
--- Class contradiction_0 accuracy: 0.8450]
--- Class contradiction_1 accuracy: 0.9563]
--- Class entailment_0 accuracy: 0.8380]
--- Class entailment_1 accuracy: 0.8022]
--- Class neutral_0 accuracy: 0.8386]
--- Class neutral_1 accuracy: 0.6437]


Weights = BERT 0.4, ALBERT 0.6


100%|██████████| 1227/1227 [01:32<00:00, 13.31it/s]


Test_loss: 0.5206 Test_acc: 0.8496
--- Class contradiction_0 accuracy: 0.8483]
--- Class contradiction_1 accuracy: 0.9623]
--- Class entailment_0 accuracy: 0.8439]
--- Class entailment_1 accuracy: 0.8022]
--- Class neutral_0 accuracy: 0.8458]
--- Class neutral_1 accuracy: 0.6437]


Weights = BERT 0.5, ALBERT 0.5


100%|██████████| 1227/1227 [01:32<00:00, 13.32it/s]


Test_loss: 0.5206 Test_acc: 0.8565
--- Class contradiction_0 accuracy: 0.8571]
--- Class contradiction_1 accuracy: 0.9623]
--- Class entailment_0 accuracy: 0.8509]
--- Class entailment_1 accuracy: 0.7912]
--- Class neutral_0 accuracy: 0.8528]
--- Class neutral_1 accuracy: 0.6437]


Weights = BERT 0.6, ALBERT 0.4


100%|██████████| 1227/1227 [01:32<00:00, 13.32it/s]


Test_loss: 0.5206 Test_acc: 0.8548
--- Class contradiction_0 accuracy: 0.8501]
--- Class contradiction_1 accuracy: 0.9583]
--- Class entailment_0 accuracy: 0.8515]
--- Class entailment_1 accuracy: 0.7582]
--- Class neutral_0 accuracy: 0.8534]
--- Class neutral_1 accuracy: 0.6782]


Weights = BERT 0.7000000000000001, ALBERT 0.29999999999999993


100%|██████████| 1227/1227 [01:32<00:00, 13.31it/s]


Test_loss: 0.5206 Test_acc: 0.8537
--- Class contradiction_0 accuracy: 0.8490]
--- Class contradiction_1 accuracy: 0.9524]
--- Class entailment_0 accuracy: 0.8492]
--- Class entailment_1 accuracy: 0.7582]
--- Class neutral_0 accuracy: 0.8538]
--- Class neutral_1 accuracy: 0.7011]


Weights = BERT 0.8, ALBERT 0.19999999999999996


100%|██████████| 1227/1227 [01:32<00:00, 13.31it/s]


Test_loss: 0.5206 Test_acc: 0.8500
--- Class contradiction_0 accuracy: 0.8453]
--- Class contradiction_1 accuracy: 0.9484]
--- Class entailment_0 accuracy: 0.8445]
--- Class entailment_1 accuracy: 0.7582]
--- Class neutral_0 accuracy: 0.8511]
--- Class neutral_1 accuracy: 0.7011]


Weights = BERT 0.9, ALBERT 0.09999999999999998


100%|██████████| 1227/1227 [01:32<00:00, 13.31it/s]

Test_loss: 0.5206 Test_acc: 0.8485
--- Class contradiction_0 accuracy: 0.8402]
--- Class contradiction_1 accuracy: 0.9524]
--- Class entailment_0 accuracy: 0.8421]
--- Class entailment_1 accuracy: 0.7802]
--- Class neutral_0 accuracy: 0.8514]
--- Class neutral_1 accuracy: 0.7241]


[0.5, 0.5]





In [72]:
evaluate_bert_albert_roberta(bert, roberta, albert, bert_val_loader, roberta_val_loader, albert_val_loader, ensemble_weights=[0.5,0.5,0.5])

100%|██████████| 1227/1227 [02:12<00:00,  9.24it/s]

Test_loss: 0.5864 Test_acc: 0.8783
--- Class contradiction_0 accuracy: 0.8811]
--- Class contradiction_1 accuracy: 0.9762]
--- Class entailment_0 accuracy: 0.8751]
--- Class entailment_1 accuracy: 0.8242]
--- Class neutral_0 accuracy: 0.8696]
--- Class neutral_1 accuracy: 0.7126]







0.878347304692048