# Fine-Tuning BERT on MultiNLI Dataset for NLI task with two labels (Entailment and Non-entailment)

Mounting Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True, timeout_ms = 0)

Mounted at /content/drive


In [None]:
base_dir = '/content/drive/My Drive/'

To share the model with the community through the Trainer API, you need an authentication token from Hugging Face.

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Imports

Installing packages

In [None]:
# Colab setup: install the Hugging Face libraries used in the rest of the notebook.
# The recorded output of this cell shows datasets 2.16.1 and accelerate 0.25.0 being installed.
# NOTE(review): nothing is pinned here, so a re-run installs whatever is current at
# that time — consider pinning the versions above to keep the notebook reproducible.
! pip install datasets
! pip install -U accelerate
! pip install -U transformers

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     

Importing libraries

In [None]:
import torch
import numpy as np
import os
import datetime
from copy import deepcopy

from transformers import (BertTokenizer,
                          BertForSequenceClassification,
                          Trainer,
                          TrainingArguments,
                          TrainerCallback)
from datasets import (load_dataset,
                      load_metric,
                      load_dataset_builder,
                      ClassLabel,
                      Value,
                      Features)

In [None]:
BATCH_SIZE = 32

In [None]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda:0


In [None]:
ds_builder = load_dataset_builder("glue", "mnli")
ds_builder.info.features

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'premise': Value(dtype='string', id=None),
 'hypothesis': Value(dtype='string', id=None),
 'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None),
 'idx': Value(dtype='int32', id=None)}

Using only train and validation data

In [None]:
train_data, val_m_data, val_mm_data = load_dataset("glue", "mnli", split=['train', 'validation_matched', 'validation_mismatched'])

print("Train dataset:\n", train_data)
print("Validation matched dataset:\n", val_m_data)
print("Validation mismatched dataset:\n", val_mm_data)

Train dataset:
 Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 392702
})
Validation matched dataset:
 Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 9815
})
Validation mismatched dataset:
 Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 9832
})


In [None]:
train_data.features

{'premise': Value(dtype='string', id=None),
 'hypothesis': Value(dtype='string', id=None),
 'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [None]:
# Example
print(train_data[0])

{'premise': 'Conceptually cream skimming has two basic dimensions - product and geography.', 'hypothesis': 'Product and geography are what make cream skimming work. ', 'label': 1, 'idx': 0}


In [None]:
metric = load_metric('glue', "mnli")

  metric = load_metric('glue', "mnli")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

In [None]:
metric

Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = datasets.load_metric('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=references)
    >>> print(res

## Preprocessing the data

Replacing neutral (1) and contradiction (2) labels with 1 (non-entailment).

In [None]:
# Reuse the existing schema and swap only the label column for a two-class label.
# NOTE(review): the label column still holds the original 0/1/2 ids here; they are
# remapped to 0/1 in a later cell — confirm casting before remapping is intended.
new_features = train_data.features.copy()
new_features['label'] = ClassLabel(
    num_classes=2,
    names=["entailment", "non-entailment"],
)

# Apply the same schema to every split in one pass.
train_data, val_m_data, val_mm_data = [
    split.cast(new_features)
    for split in (train_data, val_m_data, val_mm_data)
]

print(train_data.features)

Casting the dataset:   0%|          | 0/392702 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9815 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9832 [00:00<?, ? examples/s]

{'premise': Value(dtype='string', id=None), 'hypothesis': Value(dtype='string', id=None), 'label': ClassLabel(names=['entailment', 'non-entailment'], id=None), 'idx': Value(dtype='int32', id=None)}


In [None]:
def modify_label(example):
    """Collapse the example's label to a binary value, in place.

    Args:
        example: mapping with a ``'label'`` entry.

    Returns:
        The same ``example`` object, with ``'label'`` set to 0 when it was 0
        (entailment) and to 1 for any other value (non-entailment).
    """
    is_entailment = example['label'] == 0
    example['label'] = int(not is_entailment)
    return example

train_data = train_data.map(modify_label)
val_m_data = val_m_data.map(modify_label)
val_mm_data = val_mm_data.map(modify_label)

Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Map:   0%|          | 0/9832 [00:00<?, ? examples/s]

## Tokenizing the dataset


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
tokenizer("Hello, this one sentence!", "And this sentence goes with it.", padding="max_length", max_length=32)

{'input_ids': [101, 7592, 1010, 2023, 2028, 6251, 999, 102, 1998, 2023, 6251, 3632, 2007, 2009, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [None]:
def tokenize_function(examples):
    """Tokenize premise/hypothesis pairs with the notebook-level ``tokenizer``.

    Args:
        examples: mapping with ``"premise"`` and ``"hypothesis"`` entries
            (called with ``batched=True`` below, so presumably lists of strings).

    Returns:
        Whatever ``tokenizer`` returns for the pairs, using
        ``padding="max_length"`` and ``truncation=True``.
    """
    premises = examples["premise"]
    hypotheses = examples["hypothesis"]
    return tokenizer(
        premises,
        hypotheses,
        padding="max_length",
        truncation=True,
    )

tokenized_train_data = train_data.map(tokenize_function, batched=True)
tokenized_val_m_data = val_m_data.map(tokenize_function, batched=True)
tokenized_val_mm_data = val_mm_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Map:   0%|          | 0/9832 [00:00<?, ? examples/s]

In [None]:
example = tokenized_train_data[0]
print(example.keys())

dict_keys(['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'])


In [None]:
tokenizer.decode(example['input_ids'])

'[CLS] conceptually cream skimming has two basic dimensions - product and geography. [SEP] product and geography are what make cream skimming work. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [None]:
example['label']

1

In [None]:
small_train_dataset = tokenized_train_data.shuffle(seed=42).select(range(100000))
small_val_dataset = tokenized_val_m_data.shuffle(seed=42).select(range(1000))

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(DEVICE)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Every run writes to its own timestamped folder under base_dir, so checkpoints
# from different runs do not overwrite each other.
model_folder = base_dir + "Models-BERT-" + str(datetime.datetime.now().timestamp())

# makedirs(exist_ok=True) is idempotent and also creates missing parent folders,
# which the previous exists()/mkdir() pair did not handle.
os.makedirs(model_folder, exist_ok=True)


Trainer with arguments for saving and evaluating every epoch

In [None]:
# Name of the metric used to select the best checkpoint.
metric_name = "accuracy"

# Training configuration: evaluate and save a checkpoint once per epoch,
# writing everything into the run-specific `model_folder`.
args = TrainingArguments(
    output_dir = model_folder,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=3,
    weight_decay=0.01,
    # Reload the checkpoint with the best `metric_name` value once training ends.
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # NOTE(review): per the Transformers docs this field is not consumed by Trainer
    # itself; resuming is done via trainer.train(resume_from_checkpoint=True).
    # Confirm against the installed version whether this line has any effect.
    resume_from_checkpoint=True,
    # Push the results to the Hugging Face Hub.
    push_to_hub=True,
)

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``metric``.

    Args:
        eval_pred: pair ``(predictions, labels)``; ``predictions`` holds
            per-class scores with the classes along axis 1.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row
        against ``labels``.
    """
    class_scores, references = eval_pred
    predicted_classes = np.argmax(class_scores, axis=1)
    return metric.compute(
        predictions=predicted_classes,
        references=references,
    )

Callback that adds a training-accuracy calculation at the end of each epoch (note: it is time-consuming).

In [None]:
class CustomCallback(TrainerCallback):
    """Trainer callback that also evaluates on the training set after an epoch.

    When an epoch ends and evaluation is due, the wrapped trainer's ``evaluate``
    is called on its own ``train_dataset`` with the metric prefix ``"train"``.
    """

    def __init__(self, trainer) -> None:
        """Store the trainer whose ``evaluate`` and ``train_dataset`` are used."""
        super().__init__()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        """Run the extra training-set evaluation if evaluation is scheduled.

        Returns:
            A deep copy of ``control`` taken before the extra evaluation, or
            ``None`` when ``control.should_evaluate`` is false.
        """
        if not control.should_evaluate:
            return None

        # Snapshot first: presumably evaluate() may alter the control flags,
        # and the pre-evaluation state is what gets handed back — TODO confirm.
        control_snapshot = deepcopy(control)
        self._trainer.evaluate(
            eval_dataset=self._trainer.train_dataset,
            metric_key_prefix="train",
        )
        return control_snapshot

In [None]:
# Trainer for the full tokenized training set; evaluation runs on the small
# validation subset and is scored with compute_metrics.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train_data,
    eval_dataset=small_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

#trainer.add_callback(CustomCallback(trainer))

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2639,0.255064,0.898


FailedPreconditionError: ignored

In [None]:
#trainer.train(resume_from_checkpoint=True)

Trainer with arguments for saving and evaluating every 5000 steps

In [None]:
# Second configuration: evaluate and checkpoint every 5000 optimizer steps
# instead of once per epoch, for 2 epochs.
# NOTE(review): output_dir is the same `model_folder` used by `args` above, so
# both runs write their checkpoints into one folder — confirm this is intended.
new_args = TrainingArguments(
    output_dir = model_folder,
    evaluation_strategy = "steps",
    eval_steps=5000,
    save_strategy = "steps",
    save_steps=5000,
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # NOTE(review): per the Transformers docs this field is not consumed by Trainer
    # itself; confirm against the installed version whether it has any effect.
    resume_from_checkpoint=True,
    push_to_hub=True,
)

# NOTE(review): this passes the same `model` object that was given to `trainer`
# above. If that trainer's train() cell was run first, training here starts from
# its already-updated weights rather than from the pretrained checkpoint.
new_trainer = Trainer(
    model,
    new_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
new_trainer.train()

Step,Training Loss,Validation Loss,Accuracy
5000,0.1368,0.370258,0.886
10000,0.1172,0.432073,0.894
15000,0.1315,0.373415,0.895
20000,0.1281,0.383873,0.893


Step,Training Loss,Validation Loss,Accuracy
5000,0.1368,0.370258,0.886
10000,0.1172,0.432073,0.894
15000,0.1315,0.373415,0.895
20000,0.1281,0.383873,0.893


TrainOutput(global_step=24544, training_loss=0.1150286255751626, metrics={'train_runtime': 17388.4137, 'train_samples_per_second': 45.168, 'train_steps_per_second': 1.412, 'total_flos': 2.0664847532396544e+17, 'train_loss': 0.1150286255751626, 'epoch': 2.0})

In [None]:
new_trainer.state.log_history

[{'loss': 0.0551,
  'learning_rate': 1.9592568448500654e-05,
  'epoch': 0.04,
  'step': 500},
 {'loss': 0.0393,
  'learning_rate': 1.9185136897001307e-05,
  'epoch': 0.08,
  'step': 1000},
 {'loss': 0.0414,
  'learning_rate': 1.8777705345501956e-05,
  'epoch': 0.12,
  'step': 1500},
 {'loss': 0.041,
  'learning_rate': 1.837027379400261e-05,
  'epoch': 0.16,
  'step': 2000},
 {'loss': 0.0496,
  'learning_rate': 1.796284224250326e-05,
  'epoch': 0.2,
  'step': 2500},
 {'loss': 0.0517,
  'learning_rate': 1.7555410691003914e-05,
  'epoch': 0.24,
  'step': 3000},
 {'loss': 0.0448,
  'learning_rate': 1.7147979139504566e-05,
  'epoch': 0.29,
  'step': 3500},
 {'loss': 0.0487,
  'learning_rate': 1.6740547588005215e-05,
  'epoch': 0.33,
  'step': 4000},
 {'loss': 0.1443,
  'learning_rate': 1.6333116036505868e-05,
  'epoch': 0.37,
  'step': 4500},
 {'loss': 0.1368,
  'learning_rate': 1.592568448500652e-05,
  'epoch': 0.41,
  'step': 5000},
 {'eval_loss': 0.3702576756477356,
  'eval_accuracy': 0.

In [None]:
new_trainer.evaluate()

{'eval_loss': 0.37341487407684326,
 'eval_accuracy': 0.895,
 'eval_runtime': 8.1626,
 'eval_samples_per_second': 122.509,
 'eval_steps_per_second': 7.718,
 'epoch': 2.0}

Saving the best model based on evaluation accuracy

In [None]:
#best_model_folder = base_dir + "BERT-best"
#trainer.save_model(best_model_folder)

## Evaluate

Evaluating the loaded model

In [None]:
loaded_model = BertForSequenceClassification.from_pretrained(base_dir + "Models-BERT-1704133292.251167/checkpoint-4000/")

In [None]:
loaded_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Evaluation-only arguments: an output folder plus the evaluation batch size.
args_test = TrainingArguments(base_dir + "Models-BERT-1704137471.16337/",
         per_device_eval_batch_size=BATCH_SIZE)

# NOTE(review): `loaded_model` was loaded above from a checkpoint under a
# different run folder than the one named in `args_test` — confirm which run
# this evaluation is meant to describe.
eval_trainer = Trainer(
    model=loaded_model,
    args=args_test,
    train_dataset=small_train_dataset,
    eval_dataset= small_val_dataset, #small_test_dataset,
    compute_metrics=compute_metrics)

# Last expression of the cell: the metrics dict is shown as the cell output.
eval_trainer.evaluate()

{'eval_loss': 0.35467514395713806,
 'eval_accuracy': 0.8916963830871115,
 'eval_runtime': 80.2217,
 'eval_samples_per_second': 122.348,
 'eval_steps_per_second': 7.654}

I evaluated the checkpoints saved during training on both the validation-matched and the validation-mismatched datasets.

In [None]:
args_test = TrainingArguments(base_dir + "Models-BERT-1704137471.16337/",
         per_device_eval_batch_size=BATCH_SIZE)

In [None]:
def evaluate_checkpoints(checkpoint_root, eval_dataset, steps=(5000, 10000, 15000, 20000)):
    """Evaluate saved checkpoints on one dataset, printing each result.

    Args:
        checkpoint_root: folder that contains the ``checkpoint-<step>/``
            sub-folders; it is concatenated as-is, so it must end with ``/``.
        eval_dataset: tokenized dataset every checkpoint is scored on.
        steps: training steps whose checkpoints should be evaluated.

    Returns:
        dict mapping each step to the metrics dict returned by
        ``Trainer.evaluate``.
    """
    results = {}
    for step in steps:
        print(f"Checkpoint {step}:\n")
        checkpoint_model = BertForSequenceClassification.from_pretrained(
            checkpoint_root + f"checkpoint-{step}/"
        )
        checkpoint_model.eval()

        # No train_dataset is passed: only evaluate() is called on this trainer.
        checkpoint_trainer = Trainer(model=checkpoint_model,
                                     args=args_test,
                                     eval_dataset=eval_dataset,
                                     compute_metrics=compute_metrics)

        results[step] = checkpoint_trainer.evaluate()
        print(results[step])
    return results


# Run folder whose checkpoints are compared; kept at module level because
# later cells build checkpoint paths from it.
path = base_dir + "Models-BERT-1704137471.16337/"

# Matched validation split. The result is assigned so the cell prints each
# checkpoint's metrics without also echoing the whole dict.
val_m_results = evaluate_checkpoints(path, tokenized_val_m_data)




Checkpoint 5000:



{'eval_loss': 0.3546751141548157, 'eval_accuracy': 0.8916963830871115, 'eval_runtime': 320.3336, 'eval_samples_per_second': 30.64, 'eval_steps_per_second': 0.958}
Checkpoint 10000:



{'eval_loss': 0.3989519476890564, 'eval_accuracy': 0.9001528273051452, 'eval_runtime': 322.3407, 'eval_samples_per_second': 30.449, 'eval_steps_per_second': 0.952}
Checkpoint 15000:



{'eval_loss': 0.36958107352256775, 'eval_accuracy': 0.8984207845134997, 'eval_runtime': 322.0197, 'eval_samples_per_second': 30.48, 'eval_steps_per_second': 0.953}
Checkpoint 20000:



{'eval_loss': 0.3496173918247223, 'eval_accuracy': 0.9006622516556292, 'eval_runtime': 321.7231, 'eval_samples_per_second': 30.508, 'eval_steps_per_second': 0.954}


In [None]:
# Evaluate the checkpoints saved at these steps on the mismatched validation
# split. `path` is the run folder defined in the previous evaluation cell.
for i in [5000, 10000, 15000, 20000]:
    print(f"Checkpoint {i}:\n")
    # Load the weights saved at step i and switch the model to eval mode.
    loaded_model = BertForSequenceClassification.from_pretrained(path + f"checkpoint-{i}/")
    loaded_model.eval()

    # A fresh Trainer per checkpoint; only evaluate() is called on it.
    eval_trainer = Trainer(model=loaded_model,
                          args=args_test,
                          train_dataset=small_train_dataset,
                          eval_dataset=tokenized_val_mm_data,
                          compute_metrics=compute_metrics)

    eval_res = eval_trainer.evaluate()
    print(eval_res)

Checkpoint 5000:



{'eval_loss': 0.3495672047138214, 'eval_accuracy': 0.8898494711147275, 'eval_runtime': 320.9468, 'eval_samples_per_second': 30.634, 'eval_steps_per_second': 0.96}
Checkpoint 10000:



{'eval_loss': 0.3938233554363251, 'eval_accuracy': 0.8997152156224573, 'eval_runtime': 321.343, 'eval_samples_per_second': 30.597, 'eval_steps_per_second': 0.958}
Checkpoint 15000:



{'eval_loss': 0.35519373416900635, 'eval_accuracy': 0.8966639544344996, 'eval_runtime': 321.177, 'eval_samples_per_second': 30.612, 'eval_steps_per_second': 0.959}
Checkpoint 20000:



{'eval_loss': 0.3354913294315338, 'eval_accuracy': 0.9008340113913751, 'eval_runtime': 321.9446, 'eval_samples_per_second': 30.539, 'eval_steps_per_second': 0.957}


In [None]:
# checkpoint-20000 has the highest recorded accuracy on both validation splits
# in the sweeps above, so it is the one exported as the best model.
best_path = base_dir + "Models-BERT-1704137471.16337/" + "checkpoint-20000/"
best_model_loaded = BertForSequenceClassification.from_pretrained(best_path)
# Save a copy under a stable, run-independent folder name.
best_model_folder = base_dir + "BERT-best"
best_model_loaded.save_pretrained(best_model_folder)

Sharing the model with the Hugging Face community

In [None]:
best_model_loaded.push_to_hub("bert-base-uncased-mnli-2-labels")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/an-eve/bert-base-uncased-mnli-2-labels/commit/58808d9f546d418580d41cd64ff037bb0b7f974d', commit_message='Upload BertForSequenceClassification', commit_description='', oid='58808d9f546d418580d41cd64ff037bb0b7f974d', pr_url=None, pr_revision=None, pr_num=None)

My best model's performance:

```
MNLI: 90.07%
MNLI-mm: 90.08%
```