In [1]:
! pip install -U transformers
! pip install datasets sentence-transformers evaluate
! pip install accelerate -U

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/12/dd/f17b11a93a9ca27728e12512d167eb1281c151c4c6881d3ab59eb58f4127/transformers-4.35.2-py3-none-any.whl.metadata
  Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.5/123.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.0
    Uninstalling transformers-4.35.0:
      Successfully uninstalled transformers-4.35.0
Successfully installed transformers-4.35.2
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━

In [2]:
import numpy as np
import pandas as pd
import time
import re
from tqdm import notebook
import json

import seaborn as sns
import matplotlib.pyplot as plt

import os
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import AutoModel
from transformers import AutoConfig
from transformers import get_cosine_schedule_with_warmup

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.metrics import r2_score

from datasets import Dataset
import evaluate

from sentence_transformers import SentenceTransformer, models, util

import gc

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline



In [3]:
def seed_everything(seed, deterministic=True):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device), and exports ``PYTHONHASHSEED`` with the same value.

    Args:
        seed: Integer seed applied to all generators.
        deterministic: When True (the default) cuDNN is limited to
            deterministic algorithms and its auto-tuner is switched off, so
            seeding actually yields repeatable GPU results. Pass False to get
            the faster, non-deterministic cuDNN configuration instead.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers all visible GPUs, not only the current one; ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # The auto-tuner (benchmark) may pick different kernels from run to run,
    # so it is only enabled when determinism is explicitly not requested.
    torch.backends.cudnn.deterministic = deterministic
    torch.backends.cudnn.benchmark = not deterministic

In [4]:
# Fixed seed for the notebook; applied immediately so later cells start from seeded generators.
SEED = 97
seed_everything(SEED)

In [5]:
# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device: ', device.type)

Device:  cuda


In [6]:
def get_mnli_dataset_dfs(test_size=0.1):
    """Load the ``validation_mismatched`` split of ``multi_nli`` as a test DataFrame.

    Args:
        test_size: Accepted for signature compatibility; not used by this function.

    Returns:
        DataFrame with the columns ``sentence1``, ``sentence2`` and ``label``.
        Rows containing missing values and rows whose label is -1 are
        removed, and the index is renumbered from zero.
    """
    mismatched = load_dataset("multi_nli")["validation_mismatched"]
    pairs = (
        mismatched.to_pandas()
        .dropna()
        .loc[:, ['premise', 'hypothesis', 'label']]
        .rename(columns={'premise': 'sentence1', 'hypothesis': 'sentence2'})
    )
    keep = pairs['label'] != -1
    return pairs[keep].copy().reset_index(drop=True)

In [7]:
def get_snli_dataset_dfs(test_size=0.1):
    """Load the ``test`` split of ``snli`` as a DataFrame of sentence pairs.

    Args:
        test_size: Accepted for signature compatibility; not used by this function.

    Returns:
        DataFrame with the columns ``sentence1``, ``sentence2`` and ``label``.
        Rows containing missing values and rows whose label is -1 are
        removed, and the index is renumbered from zero.
    """
    # Only the test split is read here.
    snli_test = load_dataset("snli")["test"].to_pandas()
    pairs = snli_test.dropna().loc[:, ['premise', 'hypothesis', 'label']]
    pairs = pairs.rename(columns={'premise': 'sentence1', 'hypothesis': 'sentence2'})
    keep = pairs['label'] != -1
    return pairs[keep].copy().reset_index(drop=True)

In [8]:
def get_anli_r1_dataset_dfs(test_size=0.1):
    """Load the ``test_r1`` split of ``anli`` as a test DataFrame.

    Args:
        test_size: Accepted for signature compatibility; not used by this function.

    Returns:
        DataFrame with the columns ``sentence1``, ``sentence2`` and ``label``,
        without rows containing missing values and with the index renumbered
        from zero.
    """
    dataset = load_dataset("anli")
    test_df = dataset["test_r1"].to_pandas().dropna()

    test_df = test_df[['premise', 'hypothesis', 'label']]
    test_df.columns = ['sentence1', 'sentence2', 'label']

    # dropna() can leave gaps in the index; renumber it like the SNLI/MNLI/SICK
    # loaders do so the frame always carries a plain 0..n-1 index.
    return test_df.reset_index(drop=True)

In [9]:
def get_anli_r2_dataset_dfs(test_size=0.1):
    """Load the ``test_r2`` split of ``anli`` as a test DataFrame.

    Args:
        test_size: Accepted for signature compatibility; not used by this function.

    Returns:
        DataFrame with the columns ``sentence1``, ``sentence2`` and ``label``,
        without rows containing missing values and with the index renumbered
        from zero.
    """
    dataset = load_dataset("anli")
    test_df = dataset["test_r2"].to_pandas().dropna()

    test_df = test_df[['premise', 'hypothesis', 'label']]
    test_df.columns = ['sentence1', 'sentence2', 'label']

    # dropna() can leave gaps in the index; renumber it like the SNLI/MNLI/SICK
    # loaders do so the frame always carries a plain 0..n-1 index.
    return test_df.reset_index(drop=True)

In [10]:
def get_anli_r3_dataset_dfs(test_size=0.1):
    """Load the ``test_r3`` split of ``anli`` as a test DataFrame.

    Args:
        test_size: Accepted for signature compatibility; not used by this function.

    Returns:
        DataFrame with the columns ``sentence1``, ``sentence2`` and ``label``,
        without rows containing missing values and with the index renumbered
        from zero.
    """
    dataset = load_dataset("anli")
    test_df = dataset["test_r3"].to_pandas().dropna()

    test_df = test_df[['premise', 'hypothesis', 'label']]
    test_df.columns = ['sentence1', 'sentence2', 'label']

    # dropna() can leave gaps in the index; renumber it like the SNLI/MNLI/SICK
    # loaders do so the frame always carries a plain 0..n-1 index.
    return test_df.reset_index(drop=True)

In [11]:
def get_sick_dataset_dfs(test_size=0.1):
    """Load the ``test`` split of ``sick`` as a DataFrame of sentence pairs.

    Args:
        test_size: Accepted for signature compatibility; not used by this function.

    Returns:
        DataFrame whose ``sentence_A``/``sentence_B`` columns are renamed to
        ``sentence1``/``sentence2``, alongside ``label``. Rows containing
        missing values and rows whose label is -1 are removed, and the index
        is renumbered from zero.
    """
    sick_test = load_dataset("sick")["test"].to_pandas()
    pairs = sick_test.dropna().loc[:, ['sentence_A', 'sentence_B', 'label']]
    pairs = pairs.rename(columns={'sentence_A': 'sentence1', 'sentence_B': 'sentence2'})
    keep = pairs['label'] != -1
    return pairs[keep].copy().reset_index(drop=True)

In [12]:
def standardize_label(label):
    """Swap label ids 0 and 1; every other value becomes 2.

    Args:
        label: Label id to convert.

    Returns:
        1 when ``label`` equals 0, 0 when it equals 1, otherwise 2.
    """
    return 1 if label == 0 else (0 if label == 1 else 2)

def get_semeval_2014_taskl():
    """Load the ``test`` split of ``sem_eval_2014_task_1`` as a test DataFrame.

    The ``entailment_judgment`` ids are remapped with ``standardize_label``
    (0 and 1 swapped, anything else mapped to 2) and exposed as ``label``.

    Returns:
        DataFrame with the columns ``sentence1``, ``sentence2`` and ``label``,
        without rows containing missing values and with the index renumbered
        from zero.
    """
    dataset = load_dataset("sem_eval_2014_task_1")
    test_df = dataset["test"].to_pandas().dropna()

    # Work on an independent frame before rewriting one of its columns.
    test_df = test_df[['premise', 'hypothesis', 'entailment_judgment']].copy()
    test_df['entailment_judgment'] = test_df['entailment_judgment'].apply(standardize_label)
    test_df.columns = ['sentence1', 'sentence2', 'label']

    # Renumber rows like the SNLI/MNLI/SICK loaders do, since dropna() can leave index gaps.
    return test_df.reset_index(drop=True)

In [13]:
# Maps a short dataset name to its test DataFrame.
test_dfs = {}

In [14]:
# Load every test set in a fixed order and register it under its short name.
for dataset_key, load_test_frame in (
    ('snli', get_snli_dataset_dfs),
    ('mnli', get_mnli_dataset_dfs),
    ('anli_r1', get_anli_r1_dataset_dfs),
    ('anli_r2', get_anli_r2_dataset_dfs),
    ('anli_r3', get_anli_r3_dataset_dfs),
    ('sick', get_sick_dataset_dfs),
    ('semeval', get_semeval_2014_taskl),
):
    test_df = load_test_frame()
    test_dfs[dataset_key] = test_df

Downloading builder script:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/938 [00:00<?, ?B/s]

Downloading and preparing dataset snli/plain_text (download: 90.17 MiB, generated: 65.51 MiB, post-processed: Unknown size, total: 155.68 MiB) to /root/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b...


Downloading:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Dataset snli downloaded and prepared to /root/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading and preparing dataset multi_nli/default (download: 216.34 MiB, generated: 410.92 MiB, post-processed: Unknown size, total: 627.27 MiB) to /root/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39...


Downloading data:   0%|          | 0.00/227M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Dataset multi_nli downloaded and prepared to /root/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/2.00k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading and preparing dataset anli/plain_text (download: 17.76 MiB, generated: 73.55 MiB, post-processed: Unknown size, total: 91.31 MiB) to /root/.cache/huggingface/datasets/anli/plain_text/0.1.0/aabce88453b06dff21c201855ea83283bab0390bff746deadb30b65695755c0b...


Downloading data:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

Generating train_r1 split:   0%|          | 0/16946 [00:00<?, ? examples/s]

Generating dev_r1 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test_r1 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train_r2 split:   0%|          | 0/45460 [00:00<?, ? examples/s]

Generating dev_r2 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test_r2 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train_r3 split:   0%|          | 0/100459 [00:00<?, ? examples/s]

Generating dev_r3 split:   0%|          | 0/1200 [00:00<?, ? examples/s]

Generating test_r3 split:   0%|          | 0/1200 [00:00<?, ? examples/s]

Dataset anli downloaded and prepared to /root/.cache/huggingface/datasets/anli/plain_text/0.1.0/aabce88453b06dff21c201855ea83283bab0390bff746deadb30b65695755c0b. Subsequent calls will reuse this data.


  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

Downloading and preparing dataset sick/default (download: 212.48 KiB, generated: 2.50 MiB, post-processed: Unknown size, total: 2.71 MiB) to /root/.cache/huggingface/datasets/sick/default/0.0.0/c6b3b0b44eb84b134851396d6d464e5cb8f026960519d640e087fe33472626db...


Downloading data:   0%|          | 0.00/218k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4439 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/495 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4906 [00:00<?, ? examples/s]

Dataset sick downloaded and prepared to /root/.cache/huggingface/datasets/sick/default/0.0.0/c6b3b0b44eb84b134851396d6d464e5cb8f026960519d640e087fe33472626db. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/1.82k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset sem_eval2014_task1/default (download: 192.61 KiB, generated: 1.14 MiB, post-processed: Unknown size, total: 1.33 MiB) to /root/.cache/huggingface/datasets/sem_eval2014_task1/default/1.0.0/05e094e84ece42e036799e05c4d909ee68f6df8f82f60f9302b8ef15bb9de478...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/87.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/93.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.4k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/4500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4927 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset sem_eval2014_task1 downloaded and prepared to /root/.cache/huggingface/datasets/sem_eval2014_task1/default/1.0.0/05e094e84ece42e036799e05c4d909ee68f6df8f82f60f9302b8ef15bb9de478. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [15]:
# Key of test_dfs that the evaluation loop leaves out.
DATASET_NAME = "snli"

In [16]:
# Inputs produced by an earlier run: the metrics JSON that lists the subset sizes,
# and the directory holding one "model-<size>" checkpoint per size.
SAVED_METRICS_PATH = "/kaggle/input/snli-run-metrics/run-metrics-0.json"
SAVED_MODELS_PATH = "/kaggle/input/snli-models/SNLI_Models_BERT"

In [17]:
# Checkpoint name used below to load the tokenizer.
MODEL_NAME = "bert-base-uncased"
# Token length every sentence pair is truncated/padded to.
MAX_LEN = 128
# TRAIN_BATCH_SIZE = 32
# Per-device batch size handed to TrainingArguments for evaluation.
EVAL_BATCH_SIZE = 128
# NOTE(review): the commented-out settings in this cell look like leftovers from a training notebook — confirm they can be removed.
# SAVE_MODEL_PATH = "/content/drive/MyDrive/Representative Subset/Models"
# SAVE_LOGS_PATH = "/content/drive/MyDrive/Representative Subset/Logs"
# LEARNING_RATE = 5e-5
# EPOCHS = 3

In [18]:
# Tokenizer for MODEL_NAME; tokenize_dataset reads it as a module-level name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [19]:
def tokenize_dataset(data):
    """Tokenize the ``sentence1``/``sentence2`` pair(s) in ``data``.

    Args:
        data: Mapping that provides ``sentence1`` and ``sentence2``.

    Returns:
        The result of calling the module-level ``tokenizer`` on the pair,
        truncated and padded to ``MAX_LEN`` tokens.
    """
    encode_options = dict(max_length=MAX_LEN, truncation=True, padding="max_length")
    return tokenizer(data["sentence1"], data["sentence2"], **encode_options)

In [20]:
def get_hf_dataset(df):
    """Convert a sentence-pair DataFrame into a tokenized Hugging Face dataset.

    ``Dataset`` must be ``datasets.Dataset`` here: the notebook imports it
    after ``torch.utils.data.Dataset``, so the Hugging Face class is the one
    bound to the name.

    Args:
        df: DataFrame with ``sentence1`` and ``sentence2`` columns (plus any
            others to carry along, e.g. ``label``).

    Returns:
        The dataset built from ``df`` with the tokenizer outputs added by
        ``tokenize_dataset``.
    """
    data = Dataset.from_pandas(df)
    # batched=True passes many rows to the tokenizer per call instead of one
    # row at a time; tokenize_dataset forwards the column lists unchanged.
    return data.map(tokenize_dataset, batched=True)

In [21]:
# Accuracy metric, loaded on first use and then shared by every evaluation call.
_ACCURACY_METRIC = None


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each
            example is the argmax of ``logits`` along axis 1.

    Returns:
        The result of the ``accuracy`` metric's ``compute`` for the
        predictions against ``labels``.
    """
    global _ACCURACY_METRIC
    # Loading the metric is loop-invariant, so do it once rather than on
    # every call made by the Trainer.
    if _ACCURACY_METRIC is None:
        _ACCURACY_METRIC = evaluate.load("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return _ACCURACY_METRIC.compute(predictions=predictions, references=labels)

In [22]:
# Read the metrics JSON of the earlier run.
with open(SAVED_METRICS_PATH, 'r') as f:
    data = json.load(f)

# Subset sizes recorded in that file; the evaluation loop interpolates each one into "model-{size}".
model_sizes = data['modelling_metrics']['subset_size']
model_sizes

[54935, 109870, 164805, 219740, 274675, 329610, 384545, 439480, 494415, 549350]

In [23]:
# Arguments for the Trainer instances used for evaluation: output directory,
# evaluation batch size, and report_to='none'.
temp_training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    report_to='none'
)

In [24]:
# Makes CUDA kernel launches synchronous, which helps pinpoint failing kernels but slows execution.
# NOTE(review): looks like a debugging leftover — confirm it is still wanted for the full evaluation run.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [25]:
# evaluation_metrics = []
# for size in model_sizes:
#     print(f"Loading model from: {SAVED_MODELS_PATH}/model-{size}")
#     model = AutoModelForSequenceClassification.from_pretrained(f"{SAVED_MODELS_PATH}/model-{size}").to(device)
#     trainer = Trainer(
#         model=model,                         # the instantiated 🤗 Transformers model to be trained
#         args=temp_training_args,                  # training arguments, defined above
#         compute_metrics=compute_metrics,     # the callback that computes metrics of interest
#     )

#     # Evaluate the model
#     eval_result = trainer.evaluate(eval_dataset=anli_r3_dataset)
#     print(eval_result)
#     evaluation_metrics.append(eval_result)

#     del model
#     gc.collect()

In [26]:
def save_metrics(filePath, evaluation_metrics, model_sizes):
    """Write evaluation results and their model sizes to a JSON file.

    Args:
        filePath: Destination path; the file is created or overwritten.
        evaluation_metrics: Value stored under the ``evaluation_metrics`` key.
        model_sizes: Value stored under the ``model_sizes`` key.
    """
    payload = dict(evaluation_metrics=evaluation_metrics, model_sizes=model_sizes)
    with open(filePath, mode="w") as json_file:
        json.dump(payload, json_file)

In [27]:
# Evaluate every saved checkpoint on every test set except DATASET_NAME
# (presumably the dataset the checkpoints were trained on — confirm), and
# write one metrics file per test set.
for key, test_df in test_dfs.items():
    if key == DATASET_NAME:
        continue

    TEST_DATASET_NAME = key
    print(f"Testing for dataset: {TEST_DATASET_NAME} with {len(test_df)} samples.")
    # Tokenize once per test set; every checkpoint is scored on the same dataset.
    test_dataset = get_hf_dataset(test_df)
    evaluation_metrics = []
    for size in model_sizes:
        model_path = f"{SAVED_MODELS_PATH}/model-{size}"
        print(f"Loading model from: {model_path}")
        model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
        trainer = Trainer(
            model=model,                         # the instantiated 🤗 Transformers model to be evaluated
            args=temp_training_args,             # evaluation arguments, defined above
            compute_metrics=compute_metrics      # the callback that computes metrics of interest
        )

        # Evaluate the model
        eval_result = trainer.evaluate(eval_dataset=test_dataset)
        print(eval_result)
        evaluation_metrics.append(eval_result)

        # The Trainer holds its own reference to the model, so deleting only
        # `model` would keep the weights alive. Drop both names, collect, and
        # return cached GPU blocks before the next checkpoint is loaded.
        del trainer, model
        gc.collect()
        torch.cuda.empty_cache()

    save_metrics(f"/kaggle/working/eval-metrics-{TEST_DATASET_NAME}.json", evaluation_metrics, model_sizes)

Testing for dataset: mnli with 9832 samples.


  if _pandas_api.is_sparse(col):


  0%|          | 0/9832 [00:00<?, ?ex/s]

Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-54935


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'eval_loss': 0.828673779964447, 'eval_accuracy': 0.6839910496338486, 'eval_runtime': 44.5633, 'eval_samples_per_second': 220.63, 'eval_steps_per_second': 1.728}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-109870


{'eval_loss': 0.7084160447120667, 'eval_accuracy': 0.7112489829129374, 'eval_runtime': 42.925, 'eval_samples_per_second': 229.05, 'eval_steps_per_second': 1.794}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-164805


{'eval_loss': 0.6946902275085449, 'eval_accuracy': 0.717860048820179, 'eval_runtime': 42.9376, 'eval_samples_per_second': 228.983, 'eval_steps_per_second': 1.793}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-219740


{'eval_loss': 0.7250040173530579, 'eval_accuracy': 0.7318958502847844, 'eval_runtime': 43.0864, 'eval_samples_per_second': 228.193, 'eval_steps_per_second': 1.787}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-274675


{'eval_loss': 0.6750668883323669, 'eval_accuracy': 0.7235557363710333, 'eval_runtime': 43.0771, 'eval_samples_per_second': 228.242, 'eval_steps_per_second': 1.787}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-329610


{'eval_loss': 0.6931813955307007, 'eval_accuracy': 0.7299633848657445, 'eval_runtime': 43.1001, 'eval_samples_per_second': 228.12, 'eval_steps_per_second': 1.787}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-384545


{'eval_loss': 0.7325683832168579, 'eval_accuracy': 0.7306753458096013, 'eval_runtime': 43.3082, 'eval_samples_per_second': 227.024, 'eval_steps_per_second': 1.778}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-439480


{'eval_loss': 0.6917023658752441, 'eval_accuracy': 0.7293531326281529, 'eval_runtime': 43.3238, 'eval_samples_per_second': 226.942, 'eval_steps_per_second': 1.777}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-494415


{'eval_loss': 0.6787758469581604, 'eval_accuracy': 0.7299633848657445, 'eval_runtime': 43.3857, 'eval_samples_per_second': 226.618, 'eval_steps_per_second': 1.775}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-549350


{'eval_loss': 0.6822842955589294, 'eval_accuracy': 0.7125711960943857, 'eval_runtime': 43.3286, 'eval_samples_per_second': 226.917, 'eval_steps_per_second': 1.777}
Testing for dataset: anli_r1 with 1000 samples.


  if _pandas_api.is_sparse(col):


  0%|          | 0/1000 [00:00<?, ?ex/s]

Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-54935


{'eval_loss': 1.9745346307754517, 'eval_accuracy': 0.284, 'eval_runtime': 5.182, 'eval_samples_per_second': 192.976, 'eval_steps_per_second': 1.544}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-109870


{'eval_loss': 1.846494197845459, 'eval_accuracy': 0.26, 'eval_runtime': 5.0412, 'eval_samples_per_second': 198.364, 'eval_steps_per_second': 1.587}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-164805


{'eval_loss': 1.962878704071045, 'eval_accuracy': 0.253, 'eval_runtime': 5.0188, 'eval_samples_per_second': 199.251, 'eval_steps_per_second': 1.594}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-219740


{'eval_loss': 2.1638147830963135, 'eval_accuracy': 0.276, 'eval_runtime': 4.9909, 'eval_samples_per_second': 200.364, 'eval_steps_per_second': 1.603}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-274675


{'eval_loss': 1.8621034622192383, 'eval_accuracy': 0.254, 'eval_runtime': 5.1529, 'eval_samples_per_second': 194.067, 'eval_steps_per_second': 1.553}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-329610


{'eval_loss': 2.0907251834869385, 'eval_accuracy': 0.259, 'eval_runtime': 4.9907, 'eval_samples_per_second': 200.372, 'eval_steps_per_second': 1.603}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-384545


{'eval_loss': 2.440905809402466, 'eval_accuracy': 0.23, 'eval_runtime': 4.9968, 'eval_samples_per_second': 200.127, 'eval_steps_per_second': 1.601}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-439480


{'eval_loss': 1.8829543590545654, 'eval_accuracy': 0.284, 'eval_runtime': 4.9953, 'eval_samples_per_second': 200.187, 'eval_steps_per_second': 1.601}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-494415


{'eval_loss': 1.9706252813339233, 'eval_accuracy': 0.262, 'eval_runtime': 4.9796, 'eval_samples_per_second': 200.818, 'eval_steps_per_second': 1.607}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-549350


{'eval_loss': 1.6738940477371216, 'eval_accuracy': 0.279, 'eval_runtime': 4.9958, 'eval_samples_per_second': 200.168, 'eval_steps_per_second': 1.601}
Testing for dataset: anli_r2 with 1000 samples.


  if _pandas_api.is_sparse(col):


  0%|          | 0/1000 [00:00<?, ?ex/s]

Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-54935


{'eval_loss': 1.9679036140441895, 'eval_accuracy': 0.306, 'eval_runtime': 5.0163, 'eval_samples_per_second': 199.348, 'eval_steps_per_second': 1.595}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-109870


{'eval_loss': 1.773914098739624, 'eval_accuracy': 0.29, 'eval_runtime': 5.0846, 'eval_samples_per_second': 196.673, 'eval_steps_per_second': 1.573}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-164805


{'eval_loss': 1.7883046865463257, 'eval_accuracy': 0.304, 'eval_runtime': 5.0367, 'eval_samples_per_second': 198.543, 'eval_steps_per_second': 1.588}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-219740


{'eval_loss': 2.1036202907562256, 'eval_accuracy': 0.28, 'eval_runtime': 4.9987, 'eval_samples_per_second': 200.05, 'eval_steps_per_second': 1.6}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-274675


{'eval_loss': 1.8197051286697388, 'eval_accuracy': 0.266, 'eval_runtime': 5.0099, 'eval_samples_per_second': 199.604, 'eval_steps_per_second': 1.597}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-329610


{'eval_loss': 1.9319417476654053, 'eval_accuracy': 0.31, 'eval_runtime': 5.0113, 'eval_samples_per_second': 199.55, 'eval_steps_per_second': 1.596}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-384545


{'eval_loss': 2.2022624015808105, 'eval_accuracy': 0.283, 'eval_runtime': 5.1743, 'eval_samples_per_second': 193.263, 'eval_steps_per_second': 1.546}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-439480


{'eval_loss': 1.7196292877197266, 'eval_accuracy': 0.318, 'eval_runtime': 5.1956, 'eval_samples_per_second': 192.472, 'eval_steps_per_second': 1.54}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-494415


{'eval_loss': 1.8730242252349854, 'eval_accuracy': 0.293, 'eval_runtime': 4.9864, 'eval_samples_per_second': 200.547, 'eval_steps_per_second': 1.604}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-549350


{'eval_loss': 1.5586869716644287, 'eval_accuracy': 0.303, 'eval_runtime': 5.1738, 'eval_samples_per_second': 193.282, 'eval_steps_per_second': 1.546}
Testing for dataset: anli_r3 with 1200 samples.


  if _pandas_api.is_sparse(col):


  0%|          | 0/1200 [00:00<?, ?ex/s]

Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-54935


{'eval_loss': 1.7948964834213257, 'eval_accuracy': 0.3416666666666667, 'eval_runtime': 6.2169, 'eval_samples_per_second': 193.021, 'eval_steps_per_second': 1.609}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-109870


{'eval_loss': 1.654415249824524, 'eval_accuracy': 0.3325, 'eval_runtime': 5.8704, 'eval_samples_per_second': 204.414, 'eval_steps_per_second': 1.703}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-164805


{'eval_loss': 1.639012098312378, 'eval_accuracy': 0.32083333333333336, 'eval_runtime': 6.0008, 'eval_samples_per_second': 199.973, 'eval_steps_per_second': 1.666}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-219740


{'eval_loss': 1.9625797271728516, 'eval_accuracy': 0.32083333333333336, 'eval_runtime': 5.8716, 'eval_samples_per_second': 204.373, 'eval_steps_per_second': 1.703}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-274675


{'eval_loss': 1.6349323987960815, 'eval_accuracy': 0.30666666666666664, 'eval_runtime': 5.867, 'eval_samples_per_second': 204.535, 'eval_steps_per_second': 1.704}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-329610


{'eval_loss': 1.8814424276351929, 'eval_accuracy': 0.3175, 'eval_runtime': 5.8686, 'eval_samples_per_second': 204.479, 'eval_steps_per_second': 1.704}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-384545


{'eval_loss': 2.059359550476074, 'eval_accuracy': 0.32416666666666666, 'eval_runtime': 5.8618, 'eval_samples_per_second': 204.715, 'eval_steps_per_second': 1.706}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-439480


{'eval_loss': 1.701537013053894, 'eval_accuracy': 0.3575, 'eval_runtime': 6.1023, 'eval_samples_per_second': 196.648, 'eval_steps_per_second': 1.639}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-494415


{'eval_loss': 1.756530523300171, 'eval_accuracy': 0.355, 'eval_runtime': 5.9094, 'eval_samples_per_second': 203.065, 'eval_steps_per_second': 1.692}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-549350


{'eval_loss': 1.512949824333191, 'eval_accuracy': 0.3416666666666667, 'eval_runtime': 5.8977, 'eval_samples_per_second': 203.47, 'eval_steps_per_second': 1.696}
Testing for dataset: sick with 4906 samples.


  if _pandas_api.is_sparse(col):


  0%|          | 0/4906 [00:00<?, ?ex/s]

Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-54935


{'eval_loss': 1.822907567024231, 'eval_accuracy': 0.5046881369751325, 'eval_runtime': 21.9991, 'eval_samples_per_second': 223.009, 'eval_steps_per_second': 1.773}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-109870


{'eval_loss': 1.7119848728179932, 'eval_accuracy': 0.5034651447207501, 'eval_runtime': 22.0821, 'eval_samples_per_second': 222.171, 'eval_steps_per_second': 1.766}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-164805


{'eval_loss': 1.4955008029937744, 'eval_accuracy': 0.5120260905014268, 'eval_runtime': 21.8974, 'eval_samples_per_second': 224.045, 'eval_steps_per_second': 1.781}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-219740


{'eval_loss': 1.8347867727279663, 'eval_accuracy': 0.5695067264573991, 'eval_runtime': 21.924, 'eval_samples_per_second': 223.773, 'eval_steps_per_second': 1.779}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-274675


{'eval_loss': 1.4383747577667236, 'eval_accuracy': 0.5719527109661638, 'eval_runtime': 21.8902, 'eval_samples_per_second': 224.118, 'eval_steps_per_second': 1.782}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-329610


{'eval_loss': 1.6689497232437134, 'eval_accuracy': 0.5664492458214432, 'eval_runtime': 21.9327, 'eval_samples_per_second': 223.685, 'eval_steps_per_second': 1.778}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-384545


{'eval_loss': 1.880039095878601, 'eval_accuracy': 0.5511618426416632, 'eval_runtime': 21.8835, 'eval_samples_per_second': 224.187, 'eval_steps_per_second': 1.782}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-439480


{'eval_loss': 1.6674275398254395, 'eval_accuracy': 0.5752140236445169, 'eval_runtime': 21.9361, 'eval_samples_per_second': 223.65, 'eval_steps_per_second': 1.778}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-494415


{'eval_loss': 1.667116403579712, 'eval_accuracy': 0.5558499796167957, 'eval_runtime': 21.9228, 'eval_samples_per_second': 223.786, 'eval_steps_per_second': 1.779}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-549350


{'eval_loss': 1.6363989114761353, 'eval_accuracy': 0.556461475743987, 'eval_runtime': 22.0222, 'eval_samples_per_second': 222.775, 'eval_steps_per_second': 1.771}
Testing for dataset: semeval with 4927 samples.


  if _pandas_api.is_sparse(col):


  0%|          | 0/4927 [00:00<?, ?ex/s]

Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-54935


{'eval_loss': 1.8187209367752075, 'eval_accuracy': 0.5051755632230567, 'eval_runtime': 21.9233, 'eval_samples_per_second': 224.738, 'eval_steps_per_second': 1.779}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-109870


{'eval_loss': 1.709068775177002, 'eval_accuracy': 0.5041607469048103, 'eval_runtime': 22.1442, 'eval_samples_per_second': 222.496, 'eval_steps_per_second': 1.761}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-164805


{'eval_loss': 1.4930076599121094, 'eval_accuracy': 0.5124822407144307, 'eval_runtime': 22.0005, 'eval_samples_per_second': 223.949, 'eval_steps_per_second': 1.773}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-219740


{'eval_loss': 1.829473853111267, 'eval_accuracy': 0.5705297341181246, 'eval_runtime': 22.1243, 'eval_samples_per_second': 222.697, 'eval_steps_per_second': 1.763}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-274675


{'eval_loss': 1.434635877609253, 'eval_accuracy': 0.5731682565455652, 'eval_runtime': 21.9476, 'eval_samples_per_second': 224.489, 'eval_steps_per_second': 1.777}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-329610


{'eval_loss': 1.6639348268508911, 'eval_accuracy': 0.5676882484270347, 'eval_runtime': 21.9633, 'eval_samples_per_second': 224.329, 'eval_steps_per_second': 1.776}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-384545


{'eval_loss': 1.8745654821395874, 'eval_accuracy': 0.5522630403896894, 'eval_runtime': 22.1253, 'eval_samples_per_second': 222.686, 'eval_steps_per_second': 1.763}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-439480


{'eval_loss': 1.6623032093048096, 'eval_accuracy': 0.5764156687639537, 'eval_runtime': 21.9815, 'eval_samples_per_second': 224.143, 'eval_steps_per_second': 1.774}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-494415


{'eval_loss': 1.6612308025360107, 'eval_accuracy': 0.5573371219809214, 'eval_runtime': 22.107, 'eval_samples_per_second': 222.871, 'eval_steps_per_second': 1.764}
Loading model from: /kaggle/input/snli-models/SNLI_Models_BERT/model-549350


{'eval_loss': 1.6315189599990845, 'eval_accuracy': 0.55774304850822, 'eval_runtime': 21.9273, 'eval_samples_per_second': 224.697, 'eval_steps_per_second': 1.779}
