In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
!pip install jsonlines


Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [None]:
from typing import Optional, Union
from datasets import load_dataset
from dataclasses import dataclass
import evaluate
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForMultipleChoice, get_scheduler, AutoConfig, AutoModel
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from tqdm import tqdm
import argparse

import numpy as np
import scipy as sp

import torch.nn as nn
import torch.nn.functional as F
import argparse
import json
import os
import sys
import random
import pickle



In [None]:
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and, if present, all CUDA devices).

    Args:
        seed: Integer seed applied to every random number generator above.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Single source of truth for the notebook-wide seed; reuse it instead of repeating the literal.
SEED = 595
set_seed(SEED)


In [None]:
# Colab-only: mount Google Drive so the project folder is reachable under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Location of the project folder relative to the root of "My Drive"; edit this to match your own Drive layout.
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'Colab Notebooks/EECS595/Project/Verifiable-Coherent-NLU-main'

In [None]:
# Anchor the project path at the Drive mount point ("/content/drive") so it stays valid
# even if the working directory changes later in the session.
DRIVE_PATH = os.path.join("/content", "drive", "My Drive", GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
# Make the project's `www` package importable; the guard keeps re-runs from adding duplicates.
if DRIVE_PATH not in sys.path:
  sys.path.append(DRIVE_PATH)
# Listing the folder doubles as a check that the mount and the path are correct.
print(os.listdir(DRIVE_PATH))

['README.md', 'requirements.txt', 'www', 'all_data', 'cache', 'saved_models', 'Verifiable-Coherent-NLU.ipynb', 'withBRET_CE.ipynb', 'project_fineTunePIQA.ipynb', 'BERT_CE.ipynb']


In [None]:
import xml.etree.ElementTree as ET
import pickle
cache_train = os.path.join(DRIVE_PATH, 'all_data/ConvEnt/ConvEnt_train_resplit.json')
cache_dev = os.path.join(DRIVE_PATH,'all_data/ConvEnt/ConvEnt_dev_resplit.json')
cache_test = os.path.join(DRIVE_PATH,'all_data/ConvEnt/ConvEnt_test_resplit.json')
# Context managers close each file as soon as it has been parsed.
with open(cache_train) as f:
  ConvEnt_train = json.load(f)
with open(cache_dev) as f:
  ConvEnt_dev = json.load(f)
with open(cache_test) as f:
  ConvEnt_test = json.load(f)

# Combine train and dev and do cross-validation
cache_folds = os.path.join(DRIVE_PATH,'all_data/ConvEnt/ConvEnt_folds.pkl') # Folds used for results presented in paper
ConvEnt_train = ConvEnt_train + ConvEnt_dev
# Sort the unique sources: set iteration order for strings varies between Python processes,
# which would make the seeded fold sampling below irreproducible.
train_sources = sorted(set(ex['dialog_source'] for ex in ConvEnt_train))
print("Reserved %s dialog sources for training and validation." % len(train_sources))

no_folds = 8
if not os.path.exists(cache_folds):
  # Every fold holds out the same number of dialog sources (5 for the 40 sources / 8 folds used here).
  assert len(train_sources) % no_folds == 0, "Number of dialog sources must be divisible by no_folds"
  fold_size = len(train_sources) // no_folds
  folds = []
  for k in range(no_folds):
    # Sample this fold's held-out sources, then remove them from the remaining pool.
    folds.append(np.random.choice(train_sources, size=fold_size, replace=False))
    train_sources = [s for s in train_sources if s not in folds[-1]]
  assert len(train_sources) == 0
  print(folds)
  with open(cache_folds, 'wb') as f:
    pickle.dump(folds, f)
else:
  # NOTE: pickle.load can execute arbitrary code; only load fold files you created or trust.
  with open(cache_folds, 'rb') as f:
    folds = pickle.load(f)

Reserved 40 dialog sources for training and validation.


In [None]:
# Backbone to fine-tune; the model-selection cell recognises
# 'bert', 'roberta', 'roberta_mnli', 'deberta' and 'deberta_large'.
mode = "bert"
# Task to run; the model-selection cell accepts 'trip', 'ce' or 'art'.
task_name = "ce"

In [None]:
# 'trip' and 'ce' are treated as sequence classification; 'art' as multiple choice.
if task_name in ['trip', 'ce']:
  multiple_choice = False
elif task_name == 'art':
  multiple_choice = True
else:
  raise ValueError("Task name should be set to 'trip', 'ce', or 'art' in the first cell of the notebook!")

# Pretrained checkpoint identifier for each supported `mode`.
model_names_by_mode = {
  'bert': 'bert-large-uncased',
  'roberta': 'roberta-large',
  'roberta_mnli': 'roberta-large-mnli',
  'deberta': 'microsoft/deberta-base',
  'deberta_large': 'microsoft/deberta-large',
}
# Fail here with a clear message rather than with a NameError on `model_name` in a later cell.
if mode not in model_names_by_mode:
  raise ValueError("Mode should be one of %s, got %r!" % (sorted(model_names_by_mode), mode))
model_name = model_names_by_mode[mode]

In [None]:
from transformers import BertTokenizer, RobertaTokenizer, DebertaTokenizer, AlbertTokenizer, T5Tokenizer, GPT2Tokenizer

#from DeBERTa import deberta
# Pick the tokenizer implementation matching the selected backbone.
if mode in ['bert']:
  tokenizer_class = BertTokenizer
elif mode in ['roberta', 'roberta_mnli']:
  tokenizer_class = RobertaTokenizer
elif mode in ['deberta', 'deberta_large']:
  tokenizer_class = DebertaTokenizer
else:
  raise ValueError("No tokenizer configured for mode %r!" % mode)

# Uncased checkpoints (e.g. 'bert-large-uncased') have a lowercase-only vocabulary, so the
# input must be lowercased for them; every other checkpoint keeps the previous setting of False.
tokenizer = tokenizer_class.from_pretrained(model_name,
                                            do_lower_case='uncased' in model_name,
                                            cache_dir=os.path.join(DRIVE_PATH, 'cache'))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Check the container type of the loaded examples.
type(ConvEnt_train)

list

In [None]:
# Peek at one example from ConvEnt_train (train + dev after the data-loading cell) to see the record fields.
ConvEnt_train[0]

{'id': 15,
 'dialog_source': 'SW2020',
 'label': 1,
 'type': 'belief',
 'turns': [{'speaker': 'B',
   'text': 'Hi, um, okay what, now, uh, what particularly, particularly what kind of music do you like?'},
  {'speaker': 'A',
   'text': "Well, I mostly listen to popular music.  I, uh, listen to it all the time in, in my car, so, I, I tend to be one of those people who switches stations a lot because I don't like commercials.  But,"},
  {'speaker': 'B', 'text': 'Yeah.'},
  {'speaker': 'A',
   'text': "uh, I find myself listening to popular music, and, uh, quite honestly, I, I have some little children and I, unfortunately, found myself listening to a lot of nursery rhyme music here lately, but that's not by my choice."}],
 'hypothesis': 'SpeakerA likes popular music '}

In [None]:
# Peek at one example from the test split.
ConvEnt_test[0]

{'id': 73,
 'dialog_source': 'SW2041',
 'label': 0,
 'type': 'fact',
 'turns': [{'speaker': 'B',
   'text': "He's an aerobics instructor.  And, um, is going to be, uh, entering [breathing] North Texas [breathing] for uh, a Kinesiology program there."},
  {'speaker': 'A', 'text': 'Uh-huh.'},
  {'speaker': 'B',
   'text': 'And, um, [breathing] the, the how I met him was through, uh, the aerobics class that he used to teach.'},
  {'speaker': 'A', 'text': "Uh-huh. You're a student?"}],
 'hypothesis': 'SpeakerA is a student of her husband'}

In [None]:
# Peek at one example from the original dev split.
ConvEnt_dev[0]

{'id': 169,
 'dialog_source': 'SW2067',
 'label': 0,
 'type': 'belief',
 'turns': [{'speaker': 'B',
   'text': "Um, I don't know if I'm really afraid of spending too much.  I just, uh, don't think that I need them, you know.  I, uh, they are tempting at times,"},
  {'speaker': 'A', 'text': '[Laughter].'},
  {'speaker': 'B',
   'text': "but I, I just, you know, sometimes I just don't like everybody knowing everything about me, you know, so,"}],
 'hypothesis': "SpeakerA doesn't like to use a credit card"}

In [None]:
# Report how many examples each split holds.
# NOTE: ConvEnt_train was extended with ConvEnt_dev in the data-loading cell, so the
# "train" count already includes the dev examples.
for split_label, split_examples in (('train', ConvEnt_train), ('dev', ConvEnt_dev), ('test', ConvEnt_test)):
  print('%s examples:' % split_label, len(split_examples))

train examples: 703
dev examples: 110
test examples: 172


In [None]:
# When True, the featurization cell truncates every split and every fold to its first 10 examples.
debug = False

In [None]:
from www.dataset.featurize import add_bert_features_ConvEnt, get_tensor_dataset
import pickle
seq_length = 128

# Featurize the three splits in order: train (which already contains dev), dev, test.
ConvEnt_train, ConvEnt_dev, ConvEnt_test = (
  add_bert_features_ConvEnt(split, tokenizer, seq_length, add_segment_ids=True)
  for split in (ConvEnt_train, ConvEnt_dev, ConvEnt_test)
)

# For each fold, hold out the examples whose dialog source belongs to that fold and
# train on all remaining examples.
ConvEnt_train_folds = []
ConvEnt_dev_folds = []
for k in range(no_folds):
  held_out_sources = folds[k]
  fold_train = [ex for ex in ConvEnt_train if ex['dialog_source'] not in held_out_sources]
  fold_dev = [ex for ex in ConvEnt_train if ex['dialog_source'] in held_out_sources]

  # Debug runs only keep the first 10 examples of each fold.
  if debug:
    fold_train = fold_train[:10]
    fold_dev = fold_dev[:10]

  ConvEnt_train_folds.append(fold_train)
  ConvEnt_dev_folds.append(fold_dev)

# Debug runs also shrink the full splits (after the folds were derived from them).
if debug:
  ConvEnt_train = ConvEnt_train[:10]
  ConvEnt_dev = ConvEnt_dev[:10]
  ConvEnt_test = ConvEnt_test[:10]

ConvEnt_train_tensor = get_tensor_dataset(ConvEnt_train, label_key='label', add_segment_ids=True)
ConvEnt_test_tensor = get_tensor_dataset(ConvEnt_test, label_key='label', add_segment_ids=True)

# One tensor dataset per fold: the training portion and the held-out validation portion.
ConvEnt_train_folds_tensor = [
  get_tensor_dataset(fold_examples, label_key='label', add_segment_ids=True)
  for fold_examples in ConvEnt_train_folds
]
ConvEnt_dev_folds_tensor = [
  get_tensor_dataset(fold_examples, label_key='label', add_segment_ids=True)
  for fold_examples in ConvEnt_dev_folds
]

NameError: name 'tokenizer' is not defined

In [None]:
# NOTE(review): importing AdamW from transformers rebinds the name that the top import cell
# bound to torch.optim.AdamW; transformers' AdamW is deprecated upstream in favour of the
# PyTorch implementation — confirm which one is intended before upgrading transformers.
from transformers import BertForSequenceClassification, RobertaForSequenceClassification, DebertaForSequenceClassification, AlbertForSequenceClassification, AdamW
from transformers import BertForMultipleChoice, RobertaForMultipleChoice, AlbertForMultipleChoice, DebertaModel
from transformers import BertModel, RobertaModel, AlbertModel, DebertaModel, T5Model, T5EncoderModel, GPT2Model
from transformers import RobertaForMaskedLM
from transformers import BertConfig, RobertaConfig, DebertaConfig, AlbertConfig, T5Config, GPT2Config
#from www.model.transformers_ext import DebertaForMultipleChoice
from torch.optim import Adam

# Choose the config / encoder classes by backbone, and the task head by `multiple_choice`.
if mode == 'bert':
  config_class = BertConfig
  emb_class = BertModel
  model_class = BertForMultipleChoice if multiple_choice else BertForSequenceClassification
elif mode in ['roberta', 'roberta_mnli']:
  config_class = RobertaConfig
  emb_class = RobertaModel
  lm_class = RobertaForMaskedLM # only defined for the RoBERTa backbones
  model_class = RobertaForMultipleChoice if multiple_choice else RobertaForSequenceClassification
elif mode in ['deberta', 'deberta_large']:
  config_class = DebertaConfig
  emb_class = DebertaModel
  if multiple_choice:
    # The notebook-level import of this class is commented out above, so bring it in only
    # when it is actually needed (assumes www.model.transformers_ext is importable — confirm).
    from www.model.transformers_ext import DebertaForMultipleChoice
    model_class = DebertaForMultipleChoice
  else:
    model_class = DebertaForSequenceClassification
else:
  # Without this branch an unknown mode would only surface later as a NameError on model_class.
  raise ValueError("No model classes configured for mode %r!" % mode)

In [None]:
# Sanity check on the featurized data: the 'input_ids' key is presumably added by
# add_bert_features_ConvEnt, so this raises KeyError if the featurization cell did not run successfully.
ConvEnt_train[0]['input_ids']

KeyError: 'input_ids'

In [None]:
# Hyperparameters for the cross-validation run below.
config_batch_size = 1 # training batch size
config_lr = 1e-5 # Selected learning rate for best RoBERTa-based model in TRIP paper
config_epochs = 10 # number of training epochs per fold

In [None]:
# Grid-search axes; each list currently holds a single value, and the training cell
# asserts that exactly one batch size is given.
batch_sizes = [config_batch_size]
learning_rates = [config_lr]
epochs = config_epochs
# Batch size used for the validation dataloaders (no gradients are needed there).
eval_batch_size = 128

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
from www.model.train import train_epoch
# NOTE: this rebinds the name `evaluate` (bound to the `evaluate` package by the top import
# cell) to the project's evaluation function for the rest of the notebook.
from www.model.eval import evaluate, save_results, save_preds
from sklearn.metrics import accuracy_score
from www.utils import print_dict, get_model_dir
from collections import Counter

seed_val = 22 # Save random seed for reproducibility
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Use the GPU when one is available; otherwise fall back to CPU instead of failing in model.cuda().
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# seg_mode is switched on for every backbone except the RoBERTa variants
# (presumably it controls whether segment ids are fed to the model — confirm in www.model).
use_seg_mode = 'roberta' not in mode

# The per-fold training dataloaders are built once with a fixed batch size, so the grid
# below may only contain a single batch size.
assert len(batch_sizes) == 1
train_fold_sampler = [RandomSampler(f) for f in ConvEnt_train_folds_tensor]
train_fold_dataloader = [DataLoader(f, sampler=train_fold_sampler[i], batch_size=batch_sizes[0]) for i, f in enumerate(ConvEnt_train_folds_tensor)]

dev_fold_sampler = [SequentialSampler(f) for f in ConvEnt_dev_folds_tensor]
dev_fold_dataloader = [DataLoader(f, sampler=dev_fold_sampler[i], batch_size=eval_batch_size) for i, f in enumerate(ConvEnt_dev_folds_tensor)]

# Maps (batch size, learning rate, epoch) -> validation accuracy, summed over folds and
# divided by the number of folds after the search.
all_val_accs = Counter()
print('Beginning grid search for ConvEnt over %s parameter combination(s)!' % (str(len(batch_sizes) * len(learning_rates))))
for bs in batch_sizes:
  for lr in learning_rates:
    print('\nTRAINING MODEL: bs=%s, lr=%s' % (str(bs), str(lr)))

    for k in range(no_folds):
      print('Beginning fold %s/%s...' % (str(k+1), str(no_folds)))

      # Set up model
      if 'mnli' not in mode:
        model = model_class.from_pretrained(model_name,
                                            cache_dir=os.path.join(DRIVE_PATH, 'cache'))
      else:
        # Only the RoBERTa-MNLI mode reaches this branch; import its head class here because
        # no earlier cell defines a classification-head class.
        from transformers.models.roberta.modeling_roberta import RobertaClassificationHead
        config = config_class.from_pretrained(model_name.replace('-mnli',''),
                                        num_labels=3,
                                        cache_dir=os.path.join(DRIVE_PATH, 'cache'))
        model = model_class.from_pretrained(model_name,
                                            config=config,
                                            cache_dir=os.path.join(DRIVE_PATH, 'cache'))
        config.num_labels = 2
        model.num_labels = 2
        model.classifier = RobertaClassificationHead(config=config) # Need to bring in a classification head for only 2 labels

      model.to(device)

      # Set up optimizer
      optimizer = AdamW(model.parameters(), lr=lr)
      total_steps = len(train_fold_dataloader[k]) * epochs
      # NOTE(review): the scheduler is neither stepped in this cell nor passed to train_epoch,
      # so the learning rate presumably stays constant — confirm whether decay is intended.
      scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps = total_steps)

      for epoch in range(epochs):
        # Train the model for one epoch
        print('[%s] Beginning epoch...' % str(epoch))

        epoch_loss, _ = train_epoch(model, optimizer, train_fold_dataloader[k], device, seg_mode=use_seg_mode)

        # Validate on dev set
        results, _, _ = evaluate(model, dev_fold_dataloader[k], device, [(accuracy_score, 'accuracy')], seg_mode=use_seg_mode)
        print('[%s] Validation results:' % str(epoch))
        print_dict(results)

        # Save accuracy (a Counter treats missing keys as 0, so no membership check is needed)
        acc = results['accuracy']
        all_val_accs[(bs, lr, epoch)] += acc

        print('[%s] Finished epoch.' % str(epoch))

      # Release this fold's model and training state before the next fold is loaded.
      model.cpu()
      del model
      del optimizer
      del results
      del scheduler
      del total_steps

# Turn the per-fold sums into mean validation accuracy per (bs, lr, epoch).
for key in all_val_accs:
  all_val_accs[key] /= no_folds

print('Top performing param combos:')
print(all_val_accs.most_common(5))

# Persist the averaged accuracies; the context manager makes sure the file is flushed and closed.
save_fname = os.path.join(DRIVE_PATH, 'saved_models/%s_ConvEnt_xval_%s.pkl' % (model_name.replace('/','-'), '_'.join([str(lr) for lr in learning_rates])))
with open(save_fname, 'wb') as f:
  pickle.dump(all_val_accs, f)

Beginning grid search for ConvEnt over 1 parameter combination(s)!

TRAINING MODEL: bs=1, lr=1e-05
Beginning fold 1/8...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[0] Validation results:
{
  accuracy: 
    0.5416666666666666,
}


[1] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[1] Validation results:
{
  accuracy: 
    0.5416666666666666,
}


[2] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[2] Validation results:
{
  accuracy: 
    0.5416666666666666,
}


[3] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[3] Validation results:
{
  accuracy: 
    0.5416666666666666,
}


[4] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[4] Validation results:
{
  accuracy: 
    0.5416666666666666,
}


[5] Beginning epoch.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[0] Validation results:
{
  accuracy: 
    0.4603174603174603,
}


[1] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[1] Validation results:
{
  accuracy: 
    0.4603174603174603,
}


[2] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[2] Validation results:
{
  accuracy: 
    0.4603174603174603,
}


[3] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[3] Validation results:
{
  accuracy: 
    0.4603174603174603,
}


[4] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[4] Validation results:
{
  accuracy: 
    0.4603174603174603,
}


[5] Beginning epoch.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[0] Validation results:
{
  accuracy: 
    0.504950495049505,
}


[1] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[1] Validation results:
{
  accuracy: 
    0.49504950495049505,
}


[2] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[2] Validation results:
{
  accuracy: 
    0.49504950495049505,
}


[3] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[3] Validation results:
{
  accuracy: 
    0.49504950495049505,
}


[4] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[4] Validation results:
{
  accuracy: 
    0.49504950495049505,
}


[5] Beginning epo

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[0] Validation results:
{
  accuracy: 
    0.5641025641025641,
}


[1] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[1] Validation results:
{
  accuracy: 
    0.5641025641025641,
}


[2] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[2] Validation results:
{
  accuracy: 
    0.5641025641025641,
}


[3] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[3] Validation results:
{
  accuracy: 
    0.5641025641025641,
}


[4] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[4] Validation results:
{
  accuracy: 
    0.5641025641025641,
}


[5] Beginning epoch.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[0] Validation results:
{
  accuracy: 
    0.5180722891566265,
}


[1] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[1] Validation results:
{
  accuracy: 
    0.4819277108433735,
}


[2] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[2] Validation results:
{
  accuracy: 
    0.4819277108433735,
}


[3] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[3] Validation results:
{
  accuracy: 
    0.4819277108433735,
}


[4] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[4] Validation results:
{
  accuracy: 
    0.4819277108433735,
}


[5] Beginning epoch.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[0] Validation results:
{
  accuracy: 
    0.494949494949495,
}


[1] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[1] Validation results:
{
  accuracy: 
    0.6262626262626263,
}


[2] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[2] Validation results:
{
  accuracy: 
    0.6262626262626263,
}


[3] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[3] Validation results:
{
  accuracy: 
    0.6262626262626263,
}


[4] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[4] Validation results:
{
  accuracy: 
    0.6262626262626263,
}


[5] Beginning epoch..

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[0] Validation results:
{
  accuracy: 
    0.5068493150684932,
}


[1] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[1] Validation results:
{
  accuracy: 
    0.4931506849315068,
}


[2] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[2] Validation results:
{
  accuracy: 
    0.5068493150684932,
}


[3] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[3] Validation results:
{
  accuracy: 
    0.5068493150684932,
}


[4] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:00s.
[4] Validation results:
{
  accuracy: 
    0.5068493150684932,
}


[5] Beginning epoch.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:01s.
[0] Validation results:
{
  accuracy: 
    0.5545454545454546,
}


[1] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:01s.
[1] Validation results:
{
  accuracy: 
    0.5545454545454546,
}


[2] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:01s.
[2] Validation results:
{
  accuracy: 
    0.44545454545454544,
}


[3] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:01s.
[3] Validation results:
{
  accuracy: 
    0.5545454545454546,
}


[4] Beginning epoch...
	Beginning evaluation...
		Running prediction...
		Computing metrics...
	Finished evaluation in 0:00:01s.
[4] Validation results:
{
  accuracy: 
    0.5545454545454546,
}


[5] Beginning epoch