#Watson submission EDA

In [None]:
#Review: building a Pytorch BERT model

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 15.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 68.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 82.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.12.1 transformers-4.22.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)

In [None]:
!nvidia-smi

Mon Oct  3 23:55:56 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0    23W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import torch.nn as nn
import transformers
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

class cfg:

  #Data
  data_loc = 'PATH TO TRAINING DATA HERE'
  fold_df_loc = '/content/data_fold.csv'
  n_folds = 5

  #Model name
  model_name = 'joeddav/xlm-roberta-large-xnli'
  
  gradient_checkpointing = False
  gradient_accumulation_steps = 1
  
  #To use f16?
  apex = False
  
  #Define parameters
  batch_size = 8
  max_epochs = 3
  learning_rate = 5e-6

  #Early stopping
  patience = 1
  max_len = 100

  #Try out another loss:
  criterion = nn.CrossEntropyLoss()

  #Scheduler
  scheduler = 'linear'
  num_cycles = 0.5
  num_warmup_steps = 0

  #dropout
  hidden_dropout = 0.1
  hidden_dropout_prob = 0.1
  attention_dropout = 0.1
  attention_dropout_prob = 0.1

  #last layer id
  last_layer_id = -1



transformers.__version__: 4.22.2


In [None]:
import pandas as pd
from sklearn.model_selection import KFold

df = pd.read_csv(cfg.data_loc)

df.loc[:, 'kfold'] = -1
df.sample(frac=1, random_state=23).reset_index(drop=True)
labels = df.label.values

kfolds = KFold(n_splits = cfg.n_folds)

for fold, (trn,val) in enumerate(kfolds.split(X=df, y=labels)):
  df.loc[val, 'kfold'] = fold

cfg.train_len = len(df[df.kfold != 0])

df.to_csv("data_fold.csv", index=False)

#DataLoader

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np

class WatsonDataset(Dataset):
  def __init__(self, data, tokenizer, max_len, inference=False):
    
    self.len = len(data)
    self.max_len = max_len
    self.data = data
    self.tokenizer = tokenizer
    self.inference = inference

  def __getitem__(self, idx):
    
    #Store premise and hypothesis
    premis = self.data.premise[idx]
    hypoth = self.data.hypothesis[idx]

    #Combine into a single text string
    txt = premis + ' [SEP] ' + hypoth

    #Store label if not inference
    if self.inference == False:
      label = self.data.label[idx]

    #Encode with tokenizer
    encoding = self.tokenizer(txt, padding='max_length',
                              truncation=True, 
                              max_length=self.max_len)
    
    #Convert to tensor
    item = {key:torch.as_tensor(val) for key,val in encoding.items()}
    
    if self.inference == False:
      item['label'] = torch.as_tensor(label)
    
    return item

  def __len__(self):
    return self.len

# The model/Initialization

In [None]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    

class PooledCustomModel(nn.Module):

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model_name, output_hidden_states=True)
            self.config.hidden_dropout = self.cfg.hidden_dropout
            self.config.hidden_dropout_prob = self.cfg.hidden_dropout_prob
            self.config.attention_dropout = self.cfg.attention_dropout
            self.config.attention_probs_dropout_prob = self.cfg.attention_dropout_prob
            self.config.layer_norm_eps = self.cfg.layer_norm_eps
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model_name, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 3)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[2][self.cfg.last_layer_id]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

class Learning: #Integrate option of changing loss
  def __init__(self, model, device, optimizer, cfg):
    self.model = model
    self.device = device
    self.optimizer = optimizer
    self.num_train_steps = cfg.train_len / cfg.batch_size * cfg.max_epochs

  def loss_fn(self, output, label):
    loss = nn.CrossEntropyLoss().to(self.device)
    return loss(output, label)

  def get_scheduler(self, cfg):
    if cfg.scheduler == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            self.optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=self.num_train_steps
        )
    elif cfg.scheduler == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            self.optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=self.num_train_steps, num_cycles=cfg.num_cycles
        )
    return scheduler

  def train_fn(self, dataloader):

    self.model.train()
    scheduler = self.get_scheduler(cfg)
    scaler = torch.cuda.amp.GradScaler(enabled=cfg.apex)

    #tracking
    step_loss = 0
    training_steps = 0
    pred_list, lab_list = [], []

    for data in dataloader:

      #To device
      input_ids = data['input_ids'].to(self.device)
      masks = data['attention_mask'].to(self.device)
      labels = data['label'].to(self.device)

      loss_fn = self.loss_fn

      #Calc logits and loss
      with torch.cuda.amp.autocast(enabled=cfg.apex):
        logits = self.model({'input_ids':input_ids, 'attention_mask':masks})
        loss = loss_fn(logits, labels)
      
      #Record step_loss and loss
      step_loss += loss.item()
      training_steps += 1
      if training_steps % 200 == 0:
        print(f"Training loss after {training_steps} training steps: {step_loss/(training_steps*cfg.batch_size)}")

      #Flatten/take argmax of logits
      flatten_labs = labels.view(-1).cpu().numpy()
      training_logits = logits.view(-1, 3)
      predictions = torch.argmax(training_logits, axis=1).cpu().numpy()

      #For accuracy calc
      pred_list.extend(predictions)
      lab_list.extend(flatten_labs)

      #gradient clipping
      torch.nn.utils.clip_grad_norm_(parameters=self.model.parameters(), max_norm=10)

      #Backward/optimizer/scheduler
      self.optimizer.zero_grad()
      
      if cfg.apex == True:
        scaler.scale(loss).backward()
        scaler.step(self.optimizer)
        scaler.update()

        self.optimizer.zero_grad()
        scheduler.step()
      
      else:
        loss.backward()
        self.optimizer.step()
        scheduler.step()

    #Calc acc
    train_acc = accuracy_score(lab_list, pred_list)
    train_loss = step_loss/training_steps
    return train_acc, train_loss

  def valid_fn(self, dataloader):

    self.model.eval()

    lab_list, pred_list = [], []
    val_loss = 0

    for data in dataloader:

      input_ids = data['input_ids'].to(self.device)
      masks = data['attention_mask'].to(self.device)
      labels = data['label'].to(self.device)

      logits = self.model({'input_ids':input_ids, 'attention_mask':masks})
      loss = self.loss_fn(logits, labels)
      val_loss += loss.item()

      lab_list.extend(labels.view(-1).cpu().numpy())
      pred_list.extend(torch.argmax(logits, axis=-1).cpu().numpy())

    val_acc = accuracy_score(lab_list, pred_list)
    val_loss = val_loss/len(lab_list)

    return val_acc, val_loss

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import time
import gc
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def run_experiment(fold, cfg, params, save_model=False):

  print(params)

  #Load model and tokenizer
  tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, fast=True)

  #pull in df:
  df = pd.read_csv(cfg.fold_df_loc)

  #Pull out on the split
  train_df = df[df.kfold != fold].reset_index(drop=True)
  valid_df = df[df.kfold == fold].reset_index(drop=True)

  #Create datasets
  train_data = WatsonDataset(train_df, tokenizer, cfg.max_len)
  valid_data = WatsonDataset(valid_df, tokenizer, cfg.max_len)

  #Feed to dataloader
  train_loader = DataLoader(train_data, batch_size=cfg.batch_size, shuffle=True,
                                num_workers=2, pin_memory=True)
  valid_loader = DataLoader(valid_data, batch_size=cfg.batch_size, shuffle=True,
                                num_workers=2, pin_memory=True)


  #UPDATE CFG:
  cfg.num_training_steps = len(train_df)/cfg.batch_size*cfg.max_epochs

  cfg.attention_dropout = params['attention_dropout']
  cfg.attention_dropout_prob = params['attention_dropout_prob']
  cfg.hidden_dropout = params['hidden_dropout']
  cfg.hidden_dropout_prob = params['hidden_dropout_prob']
  cfg.last_layer_id = params['last_layer_id']
  cfg.learning_rate = params['learning_rate']
  cfg.layer_norm_eps = params['layer_norm_eps']


  #Calculate number of training steps: cfg
  model = PooledCustomModel(cfg, pretrained=True)
  model = model.to(device)

  #Select optimizer
  optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.learning_rate)

  #Instantiate for training loop
  lrn = Learning(model, device, optimizer, cfg)

  best_acc = 0
  best_epoch = 0
  patience = 1
  no_imp = 0

  val_acc = []
  val_loss = []

  for epoch in range(cfg.max_epochs):
    train_acc, train_loss = lrn.train_fn(train_loader)
    valid_acc, valid_loss = lrn.valid_fn(valid_loader)
    
    val_acc.append(valid_acc)
    val_loss.append(valid_loss)

    print(f'validation acc for epoch {epoch}: {valid_acc}')
    if valid_acc > best_acc:
      best_acc = valid_acc
      best_loss = valid_loss
      best_epoch = epoch
    
    else:
      no_imp += 1
      if no_imp >= patience:
        break

  #Clean up
  model = None
  torch.cuda.empty_cache()
  gc.collect()

  return best_acc, best_loss, best_epoch

In [None]:
import optuna

def objective(trial):
  params = {
      "layer_norm_eps": trial.suggest_float("layer_norm_eps", 1e-8, 1e-4),
      "attention_dropout": trial.suggest_float("attention_dropout", .1, .5),
      "attention_dropout_prob": trial.suggest_float("attention_dropout_prob", .1, .5),
      "hidden_dropout": trial.suggest_float("hidden_dropout", .1, .5),
      "hidden_dropout_prob": trial.suggest_float("hidden_dropout_prob", .1, .5),
      "last_layer_id": trial.suggest_int("last_layer_id", -3, -1),
      "learning_rate": trial.suggest_float("learning_rate", 5e-6, 1e-5)
      }

  all_losses = []

  for f_ in range(cfg.n_folds):
    best_acc, best_loss, best_epoch = run_experiment(f_, cfg, params)
    all_losses.append(best_loss)

  return np.mean(all_losses)

In [None]:
study = optuna.create_study(direction = 'minimize')
study.optimize(objective, n_trials=20)
print("Best Trial:")
trial_ = study.best_trial
print(trial_.values)

[32m[I 2022-10-03 23:54:46,008][0m A new study created in memory with name: no-name-a20b1ba9-638d-4799-b831-710126825a52[0m


{'layer_norm_eps': 3.037430959850435e-05, 'attention_dropout': 0.3813872637346437, 'attention_dropout_prob': 0.32310615290183314, 'hidden_dropout': 0.36452341638266383, 'hidden_dropout_prob': 0.1219189099004625, 'last_layer_id': -3, 'learning_rate': 7.768328378877015e-06}


Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.05240012298570946
Training loss after 400 training steps: 0.04565173692069948
Training loss after 600 training steps: 0.04373088080358381
Training loss after 800 training steps: 0.043266596817120445
Training loss after 1000 training steps: 0.042002600682899356
Training loss after 1200 training steps: 0.041971399336859276
validation acc for epoch 0: 0.9253300330033003
Training loss after 200 training steps: 0.026895898627117276
Training loss after 400 training steps: 0.029071907158649993
Training loss after 600 training steps: 0.028707180234778206
Training loss after 800 training steps: 0.028923143827123566
Training loss after 1000 training steps: 0.029978673623350913
Training loss after 1200 training steps: 0.029940540409346187
validation acc for epoch 1: 0.9141914191419142
{'layer_norm_eps': 3.037430959850435e-05, 'attention_dropout': 0.3813872637346437, 'attention_dropout_prob': 0.32310615290183314, 'hidden_dropout': 0.36452341638266383, 'hid

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.04817069942713715
Training loss after 400 training steps: 0.045784362761187365
Training loss after 600 training steps: 0.04511855688140107
Training loss after 800 training steps: 0.044264989422663346
Training loss after 1000 training steps: 0.04429818662640173
Training loss after 1200 training steps: 0.04314874469525724
validation acc for epoch 0: 0.9245049504950495
Training loss after 200 training steps: 0.026952195964986457
Training loss after 400 training steps: 0.029648557736363726
Training loss after 600 training steps: 0.030253031167279308
Training loss after 800 training steps: 0.03004054684097355
Training loss after 1000 training steps: 0.02980239337554667
Training loss after 1200 training steps: 0.029824901642326342
validation acc for epoch 1: 0.9195544554455446
{'layer_norm_eps': 3.037430959850435e-05, 'attention_dropout': 0.3813872637346437, 'attention_dropout_prob': 0.32310615290183314, 'hidden_dropout': 0.36452341638266383, 'hidden

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.05079622641555034
Training loss after 400 training steps: 0.04760003443050664
Training loss after 600 training steps: 0.04540032169199549
Training loss after 800 training steps: 0.04531497971562203
Training loss after 1000 training steps: 0.043735413489397613
Training loss after 1200 training steps: 0.043456723699928264
validation acc for epoch 0: 0.9170792079207921
Training loss after 200 training steps: 0.027670985938457307
Training loss after 400 training steps: 0.028666792707517742
Training loss after 600 training steps: 0.029862507641276657
Training loss after 800 training steps: 0.0299383040383691
Training loss after 1000 training steps: 0.029718215139582754
Training loss after 1200 training steps: 0.029609256342834366
validation acc for epoch 1: 0.9220297029702971
Training loss after 200 training steps: 0.018830382503801958
Training loss after 400 training steps: 0.01936304806280532
Training loss after 600 training steps: 0.0200280742886

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.05099735717056319
Training loss after 400 training steps: 0.04719986035168404
Training loss after 600 training steps: 0.04568605317598364
Training loss after 800 training steps: 0.0442430685776344
Training loss after 1000 training steps: 0.042961562667042014
Training loss after 1200 training steps: 0.042895736889719656
validation acc for epoch 0: 0.9174917491749175
Training loss after 200 training steps: 0.029928183449083007
Training loss after 400 training steps: 0.029233838724612726
Training loss after 600 training steps: 0.029962817138584796
Training loss after 800 training steps: 0.029901252031631884
Training loss after 1000 training steps: 0.03028202789957868
Training loss after 1200 training steps: 0.03070241453742104
validation acc for epoch 1: 0.9154290429042904
{'layer_norm_eps': 3.037430959850435e-05, 'attention_dropout': 0.3813872637346437, 'attention_dropout_prob': 0.32310615290183314, 'hidden_dropout': 0.36452341638266383, 'hidden_

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.05067806611536071
Training loss after 400 training steps: 0.047374278649222105
Training loss after 600 training steps: 0.04578897397654752
Training loss after 800 training steps: 0.04446756251680199
Training loss after 1000 training steps: 0.04355558087304234
Training loss after 1200 training steps: 0.04296402130945353
validation acc for epoch 0: 0.9117161716171617
Training loss after 200 training steps: 0.03220942551968619
Training loss after 400 training steps: 0.03138928708882304
Training loss after 600 training steps: 0.03124208100765827
Training loss after 800 training steps: 0.030343366998931743
Training loss after 1000 training steps: 0.029799531251890585
Training loss after 1200 training steps: 0.029963363035058137
validation acc for epoch 1: 0.9104785478547854


[32m[I 2022-10-04 02:37:15,219][0m Trial 0 finished with value: 0.029976652201519443 and parameters: {'layer_norm_eps': 3.037430959850435e-05, 'attention_dropout': 0.3813872637346437, 'attention_dropout_prob': 0.32310615290183314, 'hidden_dropout': 0.36452341638266383, 'hidden_dropout_prob': 0.1219189099004625, 'last_layer_id': -3, 'learning_rate': 7.768328378877015e-06}. Best is trial 0 with value: 0.029976652201519443.[0m


{'layer_norm_eps': 5.882039071882567e-05, 'attention_dropout': 0.22427143924478443, 'attention_dropout_prob': 0.13621338900646426, 'hidden_dropout': 0.2268176631531661, 'hidden_dropout_prob': 0.3483826828318769, 'last_layer_id': -1, 'learning_rate': 8.62331744832147e-06}


Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.08559100503101945
Training loss after 400 training steps: 0.07658629219513387
Training loss after 600 training steps: 0.07331825766557207
Training loss after 800 training steps: 0.07112116862437688
Training loss after 1000 training steps: 0.07014908698294312
Training loss after 1200 training steps: 0.0685210584239879
validation acc for epoch 0: 0.9088283828382838
Training loss after 200 training steps: 0.05596067225327715
Training loss after 400 training steps: 0.05639878835412673
Training loss after 600 training steps: 0.056488476311787966
Training loss after 800 training steps: 0.056380391898564995
Training loss after 1000 training steps: 0.0560075896740891
Training loss after 1200 training steps: 0.05635783700272441
validation acc for epoch 1: 0.905940594059406
{'layer_norm_eps': 5.882039071882567e-05, 'attention_dropout': 0.22427143924478443, 'attention_dropout_prob': 0.13621338900646426, 'hidden_dropout': 0.2268176631531661, 'hidden_dropou

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.07954822574742139
Training loss after 400 training steps: 0.07450311550172045
Training loss after 600 training steps: 0.0714525808347389
Training loss after 800 training steps: 0.0709362655109726
Training loss after 1000 training steps: 0.07008114586211742
Training loss after 1200 training steps: 0.06837617691218233
validation acc for epoch 0: 0.9141914191419142
Training loss after 200 training steps: 0.05426030838629231
Training loss after 400 training steps: 0.05589531335281208
Training loss after 600 training steps: 0.05594319388231573
Training loss after 800 training steps: 0.056737513845146165
Training loss after 1000 training steps: 0.05663938552397303
Training loss after 1200 training steps: 0.05616333552345168
validation acc for epoch 1: 0.9187293729372937
Training loss after 200 training steps: 0.04851053897989914
Training loss after 400 training steps: 0.05037237345823087
Training loss after 600 training steps: 0.05069331761139135
Tra

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.07635021857451647
Training loss after 400 training steps: 0.07035492507740855
Training loss after 600 training steps: 0.06836057785976057
Training loss after 800 training steps: 0.06688291124359239
Training loss after 1000 training steps: 0.06701865461515263
Training loss after 1200 training steps: 0.06670776787097565
validation acc for epoch 0: 0.9121287128712872
Training loss after 200 training steps: 0.05774240296334028
Training loss after 400 training steps: 0.05715031647821888
Training loss after 600 training steps: 0.05708403179422021
Training loss after 800 training steps: 0.057179416125291024
Training loss after 1000 training steps: 0.05700548685481772
Training loss after 1200 training steps: 0.056281294275152806
validation acc for epoch 1: 0.9154290429042904
Training loss after 200 training steps: 0.052971671691630036
Training loss after 400 training steps: 0.05159858396276831
Training loss after 600 training steps: 0.05143608266487717

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.08359318607486785
Training loss after 400 training steps: 0.07743402203777805
Training loss after 600 training steps: 0.07206954896450042
Training loss after 800 training steps: 0.07067201011115685
Training loss after 1000 training steps: 0.0697199559994042
Training loss after 1200 training steps: 0.06886800506617874
validation acc for epoch 0: 0.9125412541254125
Training loss after 200 training steps: 0.05537958695087582
Training loss after 400 training steps: 0.05834010514197871
Training loss after 600 training steps: 0.059076927249940736
Training loss after 800 training steps: 0.05868692144169472
Training loss after 1000 training steps: 0.05777981521934271
Training loss after 1200 training steps: 0.057099268607174354
validation acc for epoch 1: 0.9001650165016502
{'layer_norm_eps': 5.882039071882567e-05, 'attention_dropout': 0.22427143924478443, 'attention_dropout_prob': 0.13621338900646426, 'hidden_dropout': 0.2268176631531661, 'hidden_drop

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.08494746808428318
Training loss after 400 training steps: 0.07791691597551108
Training loss after 600 training steps: 0.07466320392830918
Training loss after 800 training steps: 0.07271799257607199
Training loss after 1000 training steps: 0.07112711454369128
Training loss after 1200 training steps: 0.0699909597208413
validation acc for epoch 0: 0.9075907590759076
Training loss after 200 training steps: 0.055909174818079915
Training loss after 400 training steps: 0.056931074677268045
Training loss after 600 training steps: 0.05720066515573611
Training loss after 800 training steps: 0.05748404213576577
Training loss after 1000 training steps: 0.05669977977615781
Training loss after 1200 training steps: 0.05650999823138894
validation acc for epoch 1: 0.9141914191419142
Training loss after 200 training steps: 0.05172259207814932
Training loss after 400 training steps: 0.049817536434857174
Training loss after 600 training steps: 0.05051191931900879


[32m[I 2022-10-04 05:57:26,247][0m Trial 1 finished with value: 0.03806280505416221 and parameters: {'layer_norm_eps': 5.882039071882567e-05, 'attention_dropout': 0.22427143924478443, 'attention_dropout_prob': 0.13621338900646426, 'hidden_dropout': 0.2268176631531661, 'hidden_dropout_prob': 0.3483826828318769, 'last_layer_id': -1, 'learning_rate': 8.62331744832147e-06}. Best is trial 0 with value: 0.029976652201519443.[0m


{'layer_norm_eps': 2.5379247386742868e-05, 'attention_dropout': 0.1705550338561245, 'attention_dropout_prob': 0.30373598800048485, 'hidden_dropout': 0.14153848628076127, 'hidden_dropout_prob': 0.3703954684927171, 'last_layer_id': -1, 'learning_rate': 8.344314563988124e-06}


Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.10890110563486814
Training loss after 400 training steps: 0.10283612472005188
Training loss after 600 training steps: 0.09859679120903214
Training loss after 800 training steps: 0.09703751323511824
Training loss after 1000 training steps: 0.09528508994542062
Training loss after 1200 training steps: 0.09422580251004548
validation acc for epoch 0: 0.908003300330033
Training loss after 200 training steps: 0.08042883669491857
Training loss after 400 training steps: 0.08113507724367082
Training loss after 600 training steps: 0.08194841617097458
Training loss after 800 training steps: 0.0811292042862624
Training loss after 1000 training steps: 0.08156122187152505
Training loss after 1200 training steps: 0.08142828714257727
validation acc for epoch 1: 0.9042904290429042
{'layer_norm_eps': 2.5379247386742868e-05, 'attention_dropout': 0.1705550338561245, 'attention_dropout_prob': 0.30373598800048485, 'hidden_dropout': 0.14153848628076127, 'hidden_dropou

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.11649884650483727
Training loss after 400 training steps: 0.10884339011274279
Training loss after 600 training steps: 0.10483689291713139
Training loss after 800 training steps: 0.1015253814170137
Training loss after 1000 training steps: 0.09897766532190144
Training loss after 1200 training steps: 0.09780723388772458
validation acc for epoch 0: 0.9067656765676567
Training loss after 200 training steps: 0.08630760733503848
Training loss after 400 training steps: 0.08577276652446017
Training loss after 600 training steps: 0.08584177759941668
Training loss after 800 training steps: 0.08389448352041655
Training loss after 1000 training steps: 0.0835936710559763
Training loss after 1200 training steps: 0.0836248892332272
validation acc for epoch 1: 0.893976897689769
{'layer_norm_eps': 2.5379247386742868e-05, 'attention_dropout': 0.1705550338561245, 'attention_dropout_prob': 0.30373598800048485, 'hidden_dropout': 0.14153848628076127, 'hidden_dropout_

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.11376723995432257
Training loss after 400 training steps: 0.10709731337614357
Training loss after 600 training steps: 0.10464404935017228
Training loss after 800 training steps: 0.10201039898442105
Training loss after 1000 training steps: 0.10021954729408025
Training loss after 1200 training steps: 0.09843914499506354
validation acc for epoch 0: 0.9088283828382838
Training loss after 200 training steps: 0.08366468264721334
Training loss after 400 training steps: 0.083945600730367
Training loss after 600 training steps: 0.08420520004195471
Training loss after 800 training steps: 0.08326891044387594
Training loss after 1000 training steps: 0.08342228520847857
Training loss after 1200 training steps: 0.08404720510200908
validation acc for epoch 1: 0.9071782178217822
{'layer_norm_eps': 2.5379247386742868e-05, 'attention_dropout': 0.1705550338561245, 'attention_dropout_prob': 0.30373598800048485, 'hidden_dropout': 0.14153848628076127, 'hidden_dropou

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.1068546899035573
Training loss after 400 training steps: 0.10060257314704359
Training loss after 600 training steps: 0.09867275383944313
Training loss after 800 training steps: 0.09671104257460683
Training loss after 1000 training steps: 0.0947219348885119
Training loss after 1200 training steps: 0.0940408895087118
validation acc for epoch 0: 0.9026402640264026
Training loss after 200 training steps: 0.08383742458187043
Training loss after 400 training steps: 0.08383683062624186
Training loss after 600 training steps: 0.08382696219409505
Training loss after 800 training steps: 0.08348571942886338
Training loss after 1000 training steps: 0.08286230299342423
Training loss after 1200 training steps: 0.08247606230666861
validation acc for epoch 1: 0.9018151815181518
{'layer_norm_eps': 2.5379247386742868e-05, 'attention_dropout': 0.1705550338561245, 'attention_dropout_prob': 0.30373598800048485, 'hidden_dropout': 0.14153848628076127, 'hidden_dropout

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.11614036772400141
Training loss after 400 training steps: 0.10803137256763876
Training loss after 600 training steps: 0.10381411035234729
Training loss after 800 training steps: 0.10097948839655146
Training loss after 1000 training steps: 0.10004953932017088
Training loss after 1200 training steps: 0.09830295187265922
validation acc for epoch 0: 0.9055280528052805
Training loss after 200 training steps: 0.08777929798699916
Training loss after 400 training steps: 0.08598630333784968
Training loss after 600 training steps: 0.08581925698866447
Training loss after 800 training steps: 0.08489054083125666
Training loss after 1000 training steps: 0.08408131841570139
Training loss after 1200 training steps: 0.08340118273471793
validation acc for epoch 1: 0.9084158415841584
Training loss after 200 training steps: 0.07769494107458741
Training loss after 400 training steps: 0.0783567357226275
Training loss after 600 training steps: 0.07844763101544232
Tra

[32m[I 2022-10-04 08:47:01,311][0m Trial 2 finished with value: 0.040441166894474266 and parameters: {'layer_norm_eps': 2.5379247386742868e-05, 'attention_dropout': 0.1705550338561245, 'attention_dropout_prob': 0.30373598800048485, 'hidden_dropout': 0.14153848628076127, 'hidden_dropout_prob': 0.3703954684927171, 'last_layer_id': -1, 'learning_rate': 8.344314563988124e-06}. Best is trial 0 with value: 0.029976652201519443.[0m


{'layer_norm_eps': 6.375650973803733e-05, 'attention_dropout': 0.42825411182346473, 'attention_dropout_prob': 0.2879274268206374, 'hidden_dropout': 0.1840338572204422, 'hidden_dropout_prob': 0.293527886691375, 'last_layer_id': -1, 'learning_rate': 6.8743790886822445e-06}


Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.07429959315340966
Training loss after 400 training steps: 0.07278370465384797
Training loss after 600 training steps: 0.06999499885210146
Training loss after 800 training steps: 0.0685775124450447
Training loss after 1000 training steps: 0.0667332432908006
Training loss after 1200 training steps: 0.06652898608900917
validation acc for epoch 0: 0.9187293729372937
Training loss after 200 training steps: 0.05205475815921091
Training loss after 400 training steps: 0.05220821281604003
Training loss after 600 training steps: 0.0533153486410932
Training loss after 800 training steps: 0.054234366747550666
Training loss after 1000 training steps: 0.05397410627175123
Training loss after 1200 training steps: 0.05335689441883005
validation acc for epoch 1: 0.9228547854785478
Training loss after 200 training steps: 0.044418180956854486
Training loss after 400 training steps: 0.046215515568328557
Training loss after 600 training steps: 0.04735559452189288
Tr

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.07564115227200091
Training loss after 400 training steps: 0.07011958425631747
Training loss after 600 training steps: 0.06791665824207788
Training loss after 800 training steps: 0.06636484190297778
Training loss after 1000 training steps: 0.06563937350315974
Training loss after 1200 training steps: 0.0641645145715059
validation acc for epoch 0: 0.9174917491749175
Training loss after 200 training steps: 0.052610508078942075
Training loss after 400 training steps: 0.05427556356822606
Training loss after 600 training steps: 0.053700804344456024
Training loss after 800 training steps: 0.05336618671281031
Training loss after 1000 training steps: 0.053718779475195336
Training loss after 1200 training steps: 0.05284469097435552
validation acc for epoch 1: 0.9166666666666666
{'layer_norm_eps': 6.375650973803733e-05, 'attention_dropout': 0.42825411182346473, 'attention_dropout_prob': 0.2879274268206374, 'hidden_dropout': 0.1840338572204422, 'hidden_drop

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.07309150097891688
Training loss after 400 training steps: 0.07066554232733324
Training loss after 600 training steps: 0.0681154410704039
Training loss after 800 training steps: 0.06703232609957922
Training loss after 1000 training steps: 0.06646822433406487
Training loss after 1200 training steps: 0.06514925836391437
validation acc for epoch 0: 0.9137788778877888
Training loss after 200 training steps: 0.0492925199912861
Training loss after 400 training steps: 0.05378982972819358
Training loss after 600 training steps: 0.053714942826579015
Training loss after 800 training steps: 0.0526278467332304
Training loss after 1000 training steps: 0.052630616333452056
Training loss after 1200 training steps: 0.05275370609024928
validation acc for epoch 1: 0.9158415841584159
Training loss after 200 training steps: 0.04739596599247307
Training loss after 400 training steps: 0.046041282893856984
Training loss after 600 training steps: 0.04739649157505482
Tr

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.08278561639832333
Training loss after 400 training steps: 0.07650208684732206
Training loss after 600 training steps: 0.07313729486195371
Training loss after 800 training steps: 0.07052558352064807
Training loss after 1000 training steps: 0.06849612043285742
Training loss after 1200 training steps: 0.06672013149325115
validation acc for epoch 0: 0.9084158415841584
Training loss after 200 training steps: 0.056398420054465534
Training loss after 400 training steps: 0.054674887547735126
Training loss after 600 training steps: 0.055236247105058284
Training loss after 800 training steps: 0.05528045467712218
Training loss after 1000 training steps: 0.05515342596708797
Training loss after 1200 training steps: 0.05510562890539101
validation acc for epoch 1: 0.9108910891089109
Training loss after 200 training steps: 0.047137490534223614
Training loss after 400 training steps: 0.048053082898259165
Training loss after 600 training steps: 0.048929582845109

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.07782729270868004
Training loss after 400 training steps: 0.07393110897392034
Training loss after 600 training steps: 0.07120606805197895
Training loss after 800 training steps: 0.06866618008527439
Training loss after 1000 training steps: 0.06713153596455232
Training loss after 1200 training steps: 0.06578194921952672
validation acc for epoch 0: 0.9154290429042904
Training loss after 200 training steps: 0.056994543364271524
Training loss after 400 training steps: 0.054349488178268075
Training loss after 600 training steps: 0.05497161320100228
Training loss after 800 training steps: 0.054797968014609066
Training loss after 1000 training steps: 0.05447969533340074
Training loss after 1200 training steps: 0.05416261882346589
validation acc for epoch 1: 0.9063531353135313


[32m[I 2022-10-04 12:07:19,206][0m Trial 3 finished with value: 0.03681181979878758 and parameters: {'layer_norm_eps': 6.375650973803733e-05, 'attention_dropout': 0.42825411182346473, 'attention_dropout_prob': 0.2879274268206374, 'hidden_dropout': 0.1840338572204422, 'hidden_dropout_prob': 0.293527886691375, 'last_layer_id': -1, 'learning_rate': 6.8743790886822445e-06}. Best is trial 0 with value: 0.029976652201519443.[0m


{'layer_norm_eps': 4.050461696302612e-05, 'attention_dropout': 0.18295876771253888, 'attention_dropout_prob': 0.2632682319084323, 'hidden_dropout': 0.18742515587640418, 'hidden_dropout_prob': 0.49722016841474825, 'last_layer_id': -3, 'learning_rate': 5.593318459545958e-06}


Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.13902735441923142
Training loss after 400 training steps: 0.13827310448512434
Training loss after 600 training steps: 0.1381833571071426
Training loss after 800 training steps: 0.1379712153505534
Training loss after 1000 training steps: 0.1378848413825035
Training loss after 1200 training steps: 0.13794133626545468
validation acc for epoch 0: 0.643976897689769
Training loss after 200 training steps: 0.13774847023189069
Training loss after 400 training steps: 0.13771587606519461
Training loss after 600 training steps: 0.13779234364628792
Training loss after 800 training steps: 0.1379703204892576
Training loss after 1000 training steps: 0.1379681919515133
Training loss after 1200 training steps: 0.13789924360811712
validation acc for epoch 1: 0.7186468646864687
Training loss after 200 training steps: 0.1380418971180916
Training loss after 400 training steps: 0.1380064807087183
Training loss after 600 training steps: 0.13793427004168432
Training l

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.13930585004389287
Training loss after 400 training steps: 0.13903479700908064
Training loss after 600 training steps: 0.13877135891467332
Training loss after 800 training steps: 0.1385909024439752
Training loss after 1000 training steps: 0.13847685950994493
Training loss after 1200 training steps: 0.13843224133054416
validation acc for epoch 0: 0.4414191419141914
Training loss after 200 training steps: 0.13772949520498515
Training loss after 400 training steps: 0.13798018224537373
Training loss after 600 training steps: 0.1379326473424832
Training loss after 800 training steps: 0.13799613348208367
Training loss after 1000 training steps: 0.1380549480021
Training loss after 1200 training steps: 0.13804543318847814
validation acc for epoch 1: 0.4438943894389439
Training loss after 200 training steps: 0.1381836212798953
Training loss after 400 training steps: 0.13816043535247446
Training loss after 600 training steps: 0.13811050026367108
Training 

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.13984859071671962
Training loss after 400 training steps: 0.1393238328769803
Training loss after 600 training steps: 0.13901053259770074
Training loss after 800 training steps: 0.13890216570347547
Training loss after 1000 training steps: 0.1387344343960285
Training loss after 1200 training steps: 0.13859936326121292
validation acc for epoch 0: 0.5420792079207921
Training loss after 200 training steps: 0.13763473358005285
Training loss after 400 training steps: 0.1380191671475768
Training loss after 600 training steps: 0.13802861080815396
Training loss after 800 training steps: 0.13800246758386492
Training loss after 1000 training steps: 0.13799465897679328
Training loss after 1200 training steps: 0.13793404676641027
validation acc for epoch 1: 0.5057755775577558
{'layer_norm_eps': 4.050461696302612e-05, 'attention_dropout': 0.18295876771253888, 'attention_dropout_prob': 0.2632682319084323, 'hidden_dropout': 0.18742515587640418, 'hidden_dropout_

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.1397576016932726
Training loss after 400 training steps: 0.1393242248147726
Training loss after 600 training steps: 0.13909682261447112
Training loss after 800 training steps: 0.1389015629608184
Training loss after 1000 training steps: 0.13888612492382527
Training loss after 1200 training steps: 0.13877773318439723
validation acc for epoch 0: 0.32755775577557755
Training loss after 200 training steps: 0.13835772942751645
Training loss after 400 training steps: 0.13845769930630922
Training loss after 600 training steps: 0.13860088373223942
Training loss after 800 training steps: 0.13855194711126387
Training loss after 1000 training steps: 0.13850034563988448
Training loss after 1200 training steps: 0.13846086974566182
validation acc for epoch 1: 0.408003300330033
Training loss after 200 training steps: 0.13843766309320926
Training loss after 400 training steps: 0.1382936557754874
Training loss after 600 training steps: 0.1382338222116232
Trainin

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.13904252126812935
Training loss after 400 training steps: 0.13887168934568764
Training loss after 600 training steps: 0.1386193286255002
Training loss after 800 training steps: 0.13833133475854992
Training loss after 1000 training steps: 0.13823990152031182
Training loss after 1200 training steps: 0.13813990458225212
validation acc for epoch 0: 0.36303630363036304
Training loss after 200 training steps: 0.13807430148124694
Training loss after 400 training steps: 0.13808841494843363
Training loss after 600 training steps: 0.13784191321581601
Training loss after 800 training steps: 0.13773713489063083
Training loss after 1000 training steps: 0.13777366145700215
Training loss after 1200 training steps: 0.13782341156775751
validation acc for epoch 1: 0.44636963696369636
Training loss after 200 training steps: 0.13785799600183965
Training loss after 400 training steps: 0.1378373960033059
Training loss after 600 training steps: 0.13790249121685824
Tr

[32m[I 2022-10-04 15:33:37,701][0m Trial 4 finished with value: 0.12788211210037614 and parameters: {'layer_norm_eps': 4.050461696302612e-05, 'attention_dropout': 0.18295876771253888, 'attention_dropout_prob': 0.2632682319084323, 'hidden_dropout': 0.18742515587640418, 'hidden_dropout_prob': 0.49722016841474825, 'last_layer_id': -3, 'learning_rate': 5.593318459545958e-06}. Best is trial 0 with value: 0.029976652201519443.[0m


{'layer_norm_eps': 4.4066207025251415e-05, 'attention_dropout': 0.21855051048430157, 'attention_dropout_prob': 0.33156994148908525, 'hidden_dropout': 0.11616503165025507, 'hidden_dropout_prob': 0.11487138562729991, 'last_layer_id': -1, 'learning_rate': 9.281695120611431e-06}


Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.05079983341274783
Training loss after 400 training steps: 0.04764340027642902
Training loss after 600 training steps: 0.044964443278149704
Training loss after 800 training steps: 0.045433155931386866
Training loss after 1000 training steps: 0.0449886674520094
Training loss after 1200 training steps: 0.0444338748556523
validation acc for epoch 0: 0.9183168316831684
Training loss after 200 training steps: 0.02952438687090762
Training loss after 400 training steps: 0.03308701814909
Training loss after 600 training steps: 0.032380030226292246
Training loss after 800 training steps: 0.032162701544511944
Training loss after 1000 training steps: 0.03192894394992618
Training loss after 1200 training steps: 0.03199906297018364
validation acc for epoch 1: 0.9179042904290429
{'layer_norm_eps': 4.4066207025251415e-05, 'attention_dropout': 0.21855051048430157, 'attention_dropout_prob': 0.33156994148908525, 'hidden_dropout': 0.11616503165025507, 'hidden_drop

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.052728720334125685
Training loss after 400 training steps: 0.04861236622440629
Training loss after 600 training steps: 0.04701065598987043
Training loss after 800 training steps: 0.046402938847459156
Training loss after 1000 training steps: 0.045251844686979896
Training loss after 1200 training steps: 0.04428677329099931
validation acc for epoch 0: 0.9125412541254125
Training loss after 200 training steps: 0.02906024880008772
Training loss after 400 training steps: 0.03066111563704908
Training loss after 600 training steps: 0.03050659780468171
Training loss after 800 training steps: 0.03134957229915017
Training loss after 1000 training steps: 0.03139693514123792
Training loss after 1200 training steps: 0.031755023492393473
validation acc for epoch 1: 0.9224422442244224
Training loss after 200 training steps: 0.022858807262746268
Training loss after 400 training steps: 0.023286310807416156
Training loss after 600 training steps: 0.02280543576996

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.04955141397891566
Training loss after 400 training steps: 0.04755690866441
Training loss after 600 training steps: 0.04702706533134915
Training loss after 800 training steps: 0.04648580389388371
Training loss after 1000 training steps: 0.04438525993691292
Training loss after 1200 training steps: 0.043296874879888494
validation acc for epoch 0: 0.9212046204620462
Training loss after 200 training steps: 0.03188735399307916
Training loss after 400 training steps: 0.0321890661182988
Training loss after 600 training steps: 0.03114237418281846
Training loss after 800 training steps: 0.03132565644860733
Training loss after 1000 training steps: 0.03106579195521772
Training loss after 1200 training steps: 0.03073744807940481
validation acc for epoch 1: 0.9096534653465347
{'layer_norm_eps': 4.4066207025251415e-05, 'attention_dropout': 0.21855051048430157, 'attention_dropout_prob': 0.33156994148908525, 'hidden_dropout': 0.11616503165025507, 'hidden_dropou

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.04821391175268218
Training loss after 400 training steps: 0.04553309990442358
Training loss after 600 training steps: 0.043549319713298854
Training loss after 800 training steps: 0.04377233237057226
Training loss after 1000 training steps: 0.04325152118253754
Training loss after 1200 training steps: 0.04342820486902686
validation acc for epoch 0: 0.9129537953795379
Training loss after 200 training steps: 0.034929687596159054
Training loss after 400 training steps: 0.034985626788984515
Training loss after 600 training steps: 0.033559558923298025
Training loss after 800 training steps: 0.03365001018646581
Training loss after 1000 training steps: 0.032652427372697274
Training loss after 1200 training steps: 0.03269215205097377
validation acc for epoch 1: 0.9146039603960396
Training loss after 200 training steps: 0.022927603650896345
Training loss after 400 training steps: 0.025012309740995987
Training loss after 600 training steps: 0.0248496178842

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.0479722162510734
Training loss after 400 training steps: 0.04528508842835435
Training loss after 600 training steps: 0.043339842556742954
Training loss after 800 training steps: 0.043260445270425406
Training loss after 1000 training steps: 0.042863275597221216
Training loss after 1200 training steps: 0.042413191689314164
validation acc for epoch 0: 0.9191419141914191
Training loss after 200 training steps: 0.02587222912290599
Training loss after 400 training steps: 0.02778377853668644
Training loss after 600 training steps: 0.029400093770139694
Training loss after 800 training steps: 0.029202694101913947
Training loss after 1000 training steps: 0.03010815195000032
Training loss after 1200 training steps: 0.03083967467736026
validation acc for epoch 1: 0.9051155115511551


[32m[I 2022-10-04 18:38:36,426][0m Trial 5 finished with value: 0.02980479276846639 and parameters: {'layer_norm_eps': 4.4066207025251415e-05, 'attention_dropout': 0.21855051048430157, 'attention_dropout_prob': 0.33156994148908525, 'hidden_dropout': 0.11616503165025507, 'hidden_dropout_prob': 0.11487138562729991, 'last_layer_id': -1, 'learning_rate': 9.281695120611431e-06}. Best is trial 5 with value: 0.02980479276846639.[0m


{'layer_norm_eps': 8.062777030495783e-05, 'attention_dropout': 0.4855381495921697, 'attention_dropout_prob': 0.4467636640833267, 'hidden_dropout': 0.22779719425671316, 'hidden_dropout_prob': 0.2652048337141848, 'last_layer_id': -2, 'learning_rate': 7.147419945425295e-06}


Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.09742305137217044
Training loss after 400 training steps: 0.09168182430323213
Training loss after 600 training steps: 0.08773213522819182
Training loss after 800 training steps: 0.08553662926191465
Training loss after 1000 training steps: 0.08394123253691942
Training loss after 1200 training steps: 0.08241845713462681
validation acc for epoch 0: 0.9154290429042904
Training loss after 200 training steps: 0.06788180741015822
Training loss after 400 training steps: 0.07091365102678537
Training loss after 600 training steps: 0.07115379504859448
Training loss after 800 training steps: 0.07180568415438757
Training loss after 1000 training steps: 0.07139445341564715
Training loss after 1200 training steps: 0.07061976366249534
validation acc for epoch 1: 0.9121287128712872
{'layer_norm_eps': 8.062777030495783e-05, 'attention_dropout': 0.4855381495921697, 'attention_dropout_prob': 0.4467636640833267, 'hidden_dropout': 0.22779719425671316, 'hidden_dropou

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.10210991274565458
Training loss after 400 training steps: 0.09413309387397022
Training loss after 600 training steps: 0.09102630540418129
Training loss after 800 training steps: 0.08826271231053397
Training loss after 1000 training steps: 0.0855296936975792
Training loss after 1200 training steps: 0.08377986823751901
validation acc for epoch 0: 0.9141914191419142
Training loss after 200 training steps: 0.07211788523010909
Training loss after 400 training steps: 0.06968048721784725
Training loss after 600 training steps: 0.06905641495560606
Training loss after 800 training steps: 0.06913015470607206
Training loss after 1000 training steps: 0.06828059875033796
Training loss after 1200 training steps: 0.0675842479915203
validation acc for epoch 1: 0.9100660066006601
{'layer_norm_eps': 8.062777030495783e-05, 'attention_dropout': 0.4855381495921697, 'attention_dropout_prob': 0.4467636640833267, 'hidden_dropout': 0.22779719425671316, 'hidden_dropout_

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.09916034602560103
Training loss after 400 training steps: 0.09317873511929065
Training loss after 600 training steps: 0.08753768481624623
Training loss after 800 training steps: 0.08418421996524557
Training loss after 1000 training steps: 0.08278124459460377
Training loss after 1200 training steps: 0.08138871290100118
validation acc for epoch 0: 0.9113036303630363
Training loss after 200 training steps: 0.06813662331085651
Training loss after 400 training steps: 0.06975432363105938
Training loss after 600 training steps: 0.06852088726358488
Training loss after 800 training steps: 0.06892001709609757
Training loss after 1000 training steps: 0.06822012529755012
Training loss after 1200 training steps: 0.06797260260325856
validation acc for epoch 1: 0.9055280528052805
{'layer_norm_eps': 8.062777030495783e-05, 'attention_dropout': 0.4855381495921697, 'attention_dropout_prob': 0.4467636640833267, 'hidden_dropout': 0.22779719425671316, 'hidden_dropou

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.10840583121404052
Training loss after 400 training steps: 0.09940199424512684
Training loss after 600 training steps: 0.09284369695621232
Training loss after 800 training steps: 0.08910035797045567
Training loss after 1000 training steps: 0.08651138073019683
Training loss after 1200 training steps: 0.08511691431282088
validation acc for epoch 0: 0.9018151815181518
Training loss after 200 training steps: 0.06828231587540358
Training loss after 400 training steps: 0.07107375487685204
Training loss after 600 training steps: 0.07077041277661919
Training loss after 800 training steps: 0.071349931077566
Training loss after 1000 training steps: 0.07006483095698059
Training loss after 1200 training steps: 0.06919574194781793
validation acc for epoch 1: 0.898102310231023
{'layer_norm_eps': 8.062777030495783e-05, 'attention_dropout': 0.4855381495921697, 'attention_dropout_prob': 0.4467636640833267, 'hidden_dropout': 0.22779719425671316, 'hidden_dropout_p

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.09948994142003358
Training loss after 400 training steps: 0.09061214630492032
Training loss after 600 training steps: 0.08815529856830835
Training loss after 800 training steps: 0.08525891138473525
Training loss after 1000 training steps: 0.08347475728206337
Training loss after 1200 training steps: 0.08117030708429714
validation acc for epoch 0: 0.9055280528052805
Training loss after 200 training steps: 0.06649654109496624
Training loss after 400 training steps: 0.06938131520524621
Training loss after 600 training steps: 0.06801595942505324
Training loss after 800 training steps: 0.06712832045159303
Training loss after 1000 training steps: 0.06692503215558827
Training loss after 1200 training steps: 0.06655393493594602


[32m[I 2022-10-04 21:09:52,297][0m Trial 6 finished with value: 0.03516959551634981 and parameters: {'layer_norm_eps': 8.062777030495783e-05, 'attention_dropout': 0.4855381495921697, 'attention_dropout_prob': 0.4467636640833267, 'hidden_dropout': 0.22779719425671316, 'hidden_dropout_prob': 0.2652048337141848, 'last_layer_id': -2, 'learning_rate': 7.147419945425295e-06}. Best is trial 5 with value: 0.02980479276846639.[0m


validation acc for epoch 1: 0.9042904290429042
{'layer_norm_eps': 7.608310991151871e-05, 'attention_dropout': 0.2808706151117537, 'attention_dropout_prob': 0.13711302895034547, 'hidden_dropout': 0.33438628848911534, 'hidden_dropout_prob': 0.41812717114401143, 'last_layer_id': -1, 'learning_rate': 5.162688235743372e-06}


Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training loss after 200 training steps: 0.11043811371549964
Training loss after 400 training steps: 0.10311910153366625
Training loss after 600 training steps: 0.0994136209289233
Training loss after 800 training steps: 0.09678489528829232
Training loss after 1000 training steps: 0.09503364639729261
Training loss after 1200 training steps: 0.09335213714589675
