In [1]:
# imports
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from torch.utils.data import Dataset
from datasets import load_dataset
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load Emotions Dataset

In [2]:
def _load_emotion_csv(csv_path):
    """Load one emotion CSV file with the ISO-8859-1 encoding used by this dataset."""
    return load_dataset("csv", data_files=csv_path, encoding="ISO-8859-1")


# One dataset object per split file.
emo_train = _load_emotion_csv("./Swahili Emotion Data/emo_train.csv")
emo_valid = _load_emotion_csv("./Swahili Emotion Data/emo_valid.csv")
emo_test = _load_emotion_csv("./Swahili Emotion Data/emo_test.csv")

In [3]:
# Emotion label ids (Swahili term in parentheses):
#   0 - neutral
#   1 - joy (furaha)
#   2 - anger (hasira)
#   3 - sadness (huzuni)
#   4 - disgust (machukizo)
#   5 - surprise (mshangao)
#   6 - fear (woga)
#
# The position of a name in `classes` is its id.  The label string for id 5 is
# spelled 'suprise' at runtime and is kept unchanged because it is stored in
# the model's id2label / label2id configuration.
classes = ['neutral', 'joy', 'anger', 'sadness', 'disgust', 'suprise', 'fear']

# id -> name, then the inverse name -> id lookup.
id2class = dict(enumerate(classes))
class2id = {name: idx for idx, name in id2class.items()}

## Load Pre-Trained Model
### AfriBERTa

In [4]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

AFRIBERTA_CHECKPOINT = "castorini/afriberta_base"

# Tokenizer for the checkpoint; inputs are truncated/padded to at most 512 tokens.
tokenizer = AutoTokenizer.from_pretrained(AFRIBERTA_CHECKPOINT)
tokenizer.model_max_length = 512

# NOTE(review): this loads a token-classification head (logits per token),
# while the training code below only uses the logits at position 0 as the
# per-text prediction.  Presumably a sequence-classification head was the
# intent; switching would also require changing compute_loss and
# compute_metrics, so it is left as-is here — confirm.
model = AutoModelForTokenClassification.from_pretrained(
    AFRIBERTA_CHECKPOINT,
    num_labels=len(classes),
    id2label=id2class,
    label2id=class2id,
    problem_type="multi_label_classification",
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at castorini/afriberta_base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Preprocess Data

In [5]:
# Preprocess the emotion dataset (parse the label strings, tokenise the text) before passing it to the Trainer

import ast

def process_label(data):
    """Convert the string-encoded label lists of a batch into lists of ints.

    Each entry of ``data['labels']`` is a string such as ``"[1, 3]"``.  Square
    brackets and spaces are stripped and the remainder is split on commas.
    Empty tokens are skipped, so an empty list (``"[]"``) or a trailing comma
    (``"[1,]"``) yields ``[]`` / ``[1]`` instead of raising ``ValueError`` from
    ``int('')``.

    Args:
        data: Batch mapping with a ``'labels'`` sequence of strings.

    Returns:
        The same ``data`` object, with ``data['labels']`` replaced by a list of
        integer lists (one per input row).

    Raises:
        ValueError: If a non-empty token is not a valid integer.
    """
    parsed_labels = []
    for raw in data['labels']:
        stripped = raw.replace("[", "").replace("]", "").replace(" ", "")
        # Skip empty tokens so "[]" and trailing commas do not reach int('').
        parsed_labels.append([int(token) for token in stripped.split(",") if token.strip()])
    data['labels'] = parsed_labels
    return data

def _parse_split_labels(split):
    """Apply process_label to a split in batched mode."""
    return split.map(process_label, batched=True)


# Label strings -> integer lists for every split.
tokenised_trained_emotion = _parse_split_labels(emo_train)
tokenised_valid_emotion = _parse_split_labels(emo_valid)
tokenised_test_emotion = _parse_split_labels(emo_test)


def tokenize_function(data):
    """Tokenise one example and attach its multi-hot label vector.

    ``data['labels']`` holds the class ids of the example; they are turned into
    a float vector of length ``len(classes)`` with 1.0 at each listed id.  The
    text is tokenised with truncation and padded to the tokenizer's maximum
    length.

    NOTE(review): ``padding="max_length"`` pads every example to the full
    maximum length even though the Trainer is also given a padding collator;
    leaving padding to the collator would presumably be cheaper — confirm
    before changing.
    """
    multi_hot = [0.0] * len(classes)
    for class_index in data['labels']:
        multi_hot[class_index] = 1.0

    encoded = tokenizer(data["text"], padding="max_length", truncation=True)
    encoded['labels'] = multi_hot
    return encoded


def _tokenise_split(split):
    """Tokenise every example of a split, then rename the "labels" column to "label"."""
    return split.map(tokenize_function).rename_column("labels", "label")


tokenised_trained_emotion = _tokenise_split(tokenised_trained_emotion)
tokenised_valid_emotion = _tokenise_split(tokenised_valid_emotion)
tokenised_test_emotion = _tokenise_split(tokenised_test_emotion)

#print(tokenised_trained_emotion['train']['label'])

    
#small_train_dataset = tokenised_trained_emotion["train"].shuffle(seed=42).select(range(1000))
#small_eval_dataset = tokenised_valid_emotion["train"].shuffle(seed=42).select(range(1000))


In [6]:
from transformers import DataCollatorWithPadding

# Batch collator built from the tokenizer; it is passed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [7]:
from transformers import TrainingArguments, Trainer

import evaluate


# Standalone accuracy metric.
# NOTE(review): `metric` is not referenced in the cells visible here — confirm it is still needed.
metric = evaluate.load("accuracy")
# Combined accuracy / F1 / precision / recall bundle used by compute_metrics.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)) for scalars or NumPy arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

class CustomTrainer(Trainer):
    """Trainer whose loss is a multi-label BCE on the logits at position 0.

    The forward pass is run without labels, so the model computes no loss of
    its own.  The logits are indexed as ``[:, 0, :]``, i.e. only the first
    position along the second axis is compared with the label vector.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the BCE-with-logits loss for one batch.

        Args:
            model: Model used for the forward pass.
            inputs: Batch mapping providing ``input_ids``, ``attention_mask``
                and ``labels``.
            return_outputs: If true, return ``(loss, outputs)`` instead of the
                loss alone.
            num_items_in_batch: Accepted for compatibility and ignored.  Newer
                Transformers releases pass this keyword to ``compute_loss``;
                an override without it fails with ``TypeError``.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple when
            ``return_outputs`` is true.
        """
        outputs = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )
        # Keep only the logits at position 0 of every sequence.
        cls_logits = outputs['logits'][:, 0, :]
        # Default 'mean' reduction over all (example, class) entries.
        loss = torch.nn.BCEWithLogitsLoss()(cls_logits.float(),
                                            inputs['labels'].float())
        return (loss, outputs) if return_outputs else loss
    

def compute_metrics(eval_pred):
    """Score thresholded position-0 predictions against the multi-hot labels.

    The logits at position 0 are passed through a sigmoid and thresholded at
    0.5.  Predictions and labels are then flattened to 1-D, so every
    (example, class) pair counts as one binary decision in the reported
    accuracy, F1, precision and recall.
    """
    logits, references = eval_pred
    first_position_probs = sigmoid(logits[:, 0, :])
    flat_predictions = (first_position_probs > 0.5).astype(int).reshape(-1)
    flat_references = references.astype(int).reshape(-1)
    return clf_metrics.compute(predictions=flat_predictions, references=flat_references)


#training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch", learning_rate = 5e-5, adam_epsilon = 1e-8, lr_scheduler_type = "reduce_lr_on_plateau", lr_scheduler_kwargs = {'patience':5} )

# output_dir and the per-device batch sizes are not passed and stay at the
# library defaults.
#
# NOTE(review): in the training log recorded below, eval_loss is lowest after
# epoch 1 and rises afterwards, while the learning rate drops from 5e-05 to
# 5e-09 over the run.  Presumably most of the 50 epochs do not improve the
# checkpoint restored by load_best_model_at_end — consider fewer epochs, early
# stopping, or an explicit metric_for_best_model; confirm against the library
# documentation before changing.
# NOTE(review): the commented-out variant above uses `eval_strategy`; this call
# uses `evaluation_strategy` — check which name the installed Transformers
# version expects.
training_args = TrainingArguments(
    # Schedule: evaluate and checkpoint every epoch, restore the best one at the end.
    num_train_epochs=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimiser settings.
    learning_rate=5e-5,
    adam_epsilon=1e-8,
    weight_decay=0.01,
    # Learning-rate scheduler.
    lr_scheduler_type="reduce_lr_on_plateau",
    lr_scheduler_kwargs={'patience': 5},
)


# Build the trainer with the custom BCE loss.  Each CSV was loaded on its own,
# so the training and validation rows are both read from the 'train' key of
# their respective dataset objects.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenised_trained_emotion['train'],
    eval_dataset=tokenised_valid_emotion['train'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


# Start fine-tuning; progress and per-epoch evaluation results are logged below.
trainer.train()

  1%|          | 500/60850 [02:20<4:42:21,  3.56it/s]

{'loss': 0.3287, 'grad_norm': 4.483614444732666, 'learning_rate': 5e-05, 'epoch': 0.41}


  2%|▏         | 1000/60850 [04:41<4:39:51,  3.56it/s]

{'loss': 0.2757, 'grad_norm': 1.8038887977600098, 'learning_rate': 5e-05, 'epoch': 0.82}


                                                      
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.26965609192848206, 'eval_accuracy': 0.894591915409186, 'eval_f1': 0.6139572408229125, 'eval_precision': 0.7059369202226345, 'eval_recall': 0.5431834403997144, 'eval_runtime': 14.0653, 'eval_samples_per_second': 92.213, 'eval_steps_per_second': 11.589, 'epoch': 1.0}


  2%|▏         | 1500/60850 [07:16<4:36:33,  3.58it/s] 

{'loss': 0.2394, 'grad_norm': 3.810812473297119, 'learning_rate': 5e-05, 'epoch': 1.23}


  3%|▎         | 2000/60850 [09:36<4:34:10,  3.58it/s]

{'loss': 0.1969, 'grad_norm': 2.0970964431762695, 'learning_rate': 5e-05, 'epoch': 1.64}


                                                      
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.28692805767059326, 'eval_accuracy': 0.8929397510739068, 'eval_f1': 0.6375838926174496, 'eval_precision': 0.667447306791569, 'eval_recall': 0.6102783725910065, 'eval_runtime': 14.0556, 'eval_samples_per_second': 92.277, 'eval_steps_per_second': 11.597, 'epoch': 2.0}


  4%|▍         | 2500/60850 [12:12<4:32:32,  3.57it/s] 

{'loss': 0.1843, 'grad_norm': 3.780313491821289, 'learning_rate': 5e-05, 'epoch': 2.05}


  5%|▍         | 3000/60850 [14:32<4:30:09,  3.57it/s]

{'loss': 0.1183, 'grad_norm': 2.3220269680023193, 'learning_rate': 5e-05, 'epoch': 2.47}


  6%|▌         | 3500/60850 [16:52<4:27:04,  3.58it/s]

{'loss': 0.1284, 'grad_norm': 7.947541236877441, 'learning_rate': 5e-05, 'epoch': 2.88}


                                                      
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.3367762863636017, 'eval_accuracy': 0.8919484524727392, 'eval_f1': 0.6285497917455509, 'eval_precision': 0.6693548387096774, 'eval_recall': 0.5924339757316203, 'eval_runtime': 14.0684, 'eval_samples_per_second': 92.193, 'eval_steps_per_second': 11.586, 'epoch': 3.0}


  7%|▋         | 4000/60850 [19:27<4:25:11,  3.57it/s] 

{'loss': 0.0936, 'grad_norm': 2.824280023574829, 'learning_rate': 5e-05, 'epoch': 3.29}


  7%|▋         | 4500/60850 [21:47<4:22:51,  3.57it/s]

{'loss': 0.08, 'grad_norm': 0.7192370891571045, 'learning_rate': 5e-05, 'epoch': 3.7}


                                                      
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.40995877981185913, 'eval_accuracy': 0.8848992179755479, 'eval_f1': 0.622607439508848, 'eval_precision': 0.6301169590643275, 'eval_recall': 0.6152748037116346, 'eval_runtime': 14.0489, 'eval_samples_per_second': 92.32, 'eval_steps_per_second': 11.602, 'epoch': 4.0}


  8%|▊         | 5000/60850 [24:23<4:20:14,  3.58it/s] 

{'loss': 0.0764, 'grad_norm': 2.999631404876709, 'learning_rate': 5e-05, 'epoch': 4.11}


  9%|▉         | 5500/60850 [26:43<4:18:02,  3.58it/s]

{'loss': 0.0591, 'grad_norm': 15.180964469909668, 'learning_rate': 5e-05, 'epoch': 4.52}


 10%|▉         | 6000/60850 [29:03<4:15:44,  3.57it/s]

{'loss': 0.0648, 'grad_norm': 4.84954309463501, 'learning_rate': 5e-05, 'epoch': 4.93}


                                                      
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.4526340961456299, 'eval_accuracy': 0.8902962881374601, 'eval_f1': 0.6364963503649635, 'eval_precision': 0.6512322628827483, 'eval_recall': 0.622412562455389, 'eval_runtime': 14.0545, 'eval_samples_per_second': 92.284, 'eval_steps_per_second': 11.598, 'epoch': 5.0}


 11%|█         | 6500/60850 [31:38<4:13:45,  3.57it/s] 

{'loss': 0.0466, 'grad_norm': 0.3904878497123718, 'learning_rate': 5e-05, 'epoch': 5.34}


 12%|█▏        | 7000/60850 [33:58<4:11:29,  3.57it/s]

{'loss': 0.0457, 'grad_norm': 1.3068572282791138, 'learning_rate': 5e-05, 'epoch': 5.75}


                                                      
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.5687185525894165, 'eval_accuracy': 0.8798325806806917, 'eval_f1': 0.5954764553207267, 'eval_precision': 0.6195987654320988, 'eval_recall': 0.5731620271234832, 'eval_runtime': 14.0602, 'eval_samples_per_second': 92.246, 'eval_steps_per_second': 11.593, 'epoch': 6.0}


 12%|█▏        | 7500/60850 [36:33<4:08:50,  3.57it/s] 

{'loss': 0.0496, 'grad_norm': 1.698649287223816, 'learning_rate': 5e-05, 'epoch': 6.16}


 13%|█▎        | 8000/60850 [38:53<4:06:33,  3.57it/s]

{'loss': 0.0353, 'grad_norm': 0.994245171546936, 'learning_rate': 5e-05, 'epoch': 6.57}


 14%|█▍        | 8500/60850 [41:13<4:03:59,  3.58it/s]

{'loss': 0.0483, 'grad_norm': 3.115760564804077, 'learning_rate': 5e-05, 'epoch': 6.98}


                                                      
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.5750183463096619, 'eval_accuracy': 0.8779601277673753, 'eval_f1': 0.5997109826589595, 'eval_precision': 0.6071689831748354, 'eval_recall': 0.5924339757316203, 'eval_runtime': 14.0518, 'eval_samples_per_second': 92.301, 'eval_steps_per_second': 11.6, 'epoch': 7.0}


 15%|█▍        | 9000/60850 [43:48<4:01:25,  3.58it/s] 

{'loss': 0.0219, 'grad_norm': 4.2072014808654785, 'learning_rate': 5e-06, 'epoch': 7.4}


 16%|█▌        | 9500/60850 [46:08<3:59:07,  3.58it/s]

{'loss': 0.0193, 'grad_norm': 0.07604951411485672, 'learning_rate': 5e-06, 'epoch': 7.81}


                                                      
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.5514623522758484, 'eval_accuracy': 0.8845687851084921, 'eval_f1': 0.6227501799856011, 'eval_precision': 0.6281771968046478, 'eval_recall': 0.6174161313347609, 'eval_runtime': 14.0541, 'eval_samples_per_second': 92.286, 'eval_steps_per_second': 11.598, 'epoch': 8.0}


 16%|█▋        | 10000/60850 [48:44<3:56:54,  3.58it/s]

{'loss': 0.0137, 'grad_norm': 0.07821623235940933, 'learning_rate': 5e-06, 'epoch': 8.22}


 17%|█▋        | 10500/60850 [51:04<3:54:28,  3.58it/s]

{'loss': 0.0114, 'grad_norm': 1.824655294418335, 'learning_rate': 5e-06, 'epoch': 8.63}


                                                       
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.5634276270866394, 'eval_accuracy': 0.8852296508426039, 'eval_f1': 0.624910007199424, 'eval_precision': 0.6303558460421206, 'eval_recall': 0.6195574589578873, 'eval_runtime': 14.0478, 'eval_samples_per_second': 92.327, 'eval_steps_per_second': 11.603, 'epoch': 9.0}


 18%|█▊        | 11000/60850 [53:39<3:52:22,  3.58it/s] 

{'loss': 0.0104, 'grad_norm': 0.011912371963262558, 'learning_rate': 5e-06, 'epoch': 9.04}


 19%|█▉        | 11500/60850 [55:59<3:50:08,  3.57it/s]

{'loss': 0.0067, 'grad_norm': 0.08164281398057938, 'learning_rate': 5e-06, 'epoch': 9.45}


 20%|█▉        | 12000/60850 [58:19<3:47:42,  3.58it/s]

{'loss': 0.0057, 'grad_norm': 0.020285628736019135, 'learning_rate': 5e-06, 'epoch': 9.86}


                                                       
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.5772406458854675, 'eval_accuracy': 0.8844586408194736, 'eval_f1': 0.621436304583183, 'eval_precision': 0.6284671532846715, 'eval_recall': 0.6145610278372591, 'eval_runtime': 14.0522, 'eval_samples_per_second': 92.299, 'eval_steps_per_second': 11.6, 'epoch': 10.0}


 21%|██        | 12500/60850 [1:00:54<3:45:25,  3.57it/s]

{'loss': 0.0061, 'grad_norm': 0.04183290898799896, 'learning_rate': 5e-06, 'epoch': 10.27}


 21%|██▏       | 13000/60850 [1:03:14<3:43:21,  3.57it/s]

{'loss': 0.0046, 'grad_norm': 0.027263857424259186, 'learning_rate': 5e-06, 'epoch': 10.68}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6034977436065674, 'eval_accuracy': 0.8847890736865294, 'eval_f1': 0.6242816091954023, 'eval_precision': 0.6283441793203182, 'eval_recall': 0.6202712348322627, 'eval_runtime': 14.0632, 'eval_samples_per_second': 92.227, 'eval_steps_per_second': 11.591, 'epoch': 11.0}


 22%|██▏       | 13500/60850 [1:05:49<3:40:39,  3.58it/s] 

{'loss': 0.0037, 'grad_norm': 0.017546843737363815, 'learning_rate': 5e-06, 'epoch': 11.09}


 23%|██▎       | 14000/60850 [1:08:09<3:38:19,  3.58it/s]

{'loss': 0.0035, 'grad_norm': 0.07071901112794876, 'learning_rate': 5e-06, 'epoch': 11.5}


 24%|██▍       | 14500/60850 [1:10:29<3:35:58,  3.58it/s]

{'loss': 0.0042, 'grad_norm': 0.06451301276683807, 'learning_rate': 5e-06, 'epoch': 11.91}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6125440001487732, 'eval_accuracy': 0.8851195065535852, 'eval_f1': 0.6244148361541232, 'eval_precision': 0.6300872093023255, 'eval_recall': 0.6188436830835118, 'eval_runtime': 14.0594, 'eval_samples_per_second': 92.251, 'eval_steps_per_second': 11.594, 'epoch': 12.0}


 25%|██▍       | 15000/60850 [1:13:05<3:33:58,  3.57it/s] 

{'loss': 0.0025, 'grad_norm': 0.017443569377064705, 'learning_rate': 5e-06, 'epoch': 12.33}


 25%|██▌       | 15500/60850 [1:15:25<3:31:17,  3.58it/s]

{'loss': 0.0023, 'grad_norm': 0.03938056156039238, 'learning_rate': 5e-06, 'epoch': 12.74}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6333132982254028, 'eval_accuracy': 0.8882035466461063, 'eval_f1': 0.6318462096481683, 'eval_precision': 0.6423303834808259, 'eval_recall': 0.6216987865810135, 'eval_runtime': 14.0531, 'eval_samples_per_second': 92.293, 'eval_steps_per_second': 11.599, 'epoch': 13.0}


 26%|██▋       | 16000/60850 [1:18:00<3:29:24,  3.57it/s] 

{'loss': 0.0028, 'grad_norm': 0.006894456688314676, 'learning_rate': 5.000000000000001e-07, 'epoch': 13.15}


 27%|██▋       | 16500/60850 [1:20:20<3:26:52,  3.57it/s]

{'loss': 0.0015, 'grad_norm': 0.06496504694223404, 'learning_rate': 5.000000000000001e-07, 'epoch': 13.56}


 28%|██▊       | 17000/60850 [1:22:40<3:24:30,  3.57it/s]

{'loss': 0.0019, 'grad_norm': 4.68906307220459, 'learning_rate': 5.000000000000001e-07, 'epoch': 13.97}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6332783699035645, 'eval_accuracy': 0.8850093622645666, 'eval_f1': 0.6225596529284165, 'eval_precision': 0.6307692307692307, 'eval_recall': 0.6145610278372591, 'eval_runtime': 14.0491, 'eval_samples_per_second': 92.319, 'eval_steps_per_second': 11.602, 'epoch': 14.0}


 29%|██▉       | 17500/60850 [1:25:15<3:21:57,  3.58it/s] 

{'loss': 0.0017, 'grad_norm': 0.01241094246506691, 'learning_rate': 5.000000000000001e-07, 'epoch': 14.38}


 30%|██▉       | 18000/60850 [1:27:35<3:19:32,  3.58it/s]

{'loss': 0.0015, 'grad_norm': 0.01210253406316042, 'learning_rate': 5.000000000000001e-07, 'epoch': 14.79}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6352468729019165, 'eval_accuracy': 0.8855600837096597, 'eval_f1': 0.6239594643503439, 'eval_precision': 0.6328928046989721, 'eval_recall': 0.6152748037116346, 'eval_runtime': 14.0484, 'eval_samples_per_second': 92.323, 'eval_steps_per_second': 11.603, 'epoch': 15.0}


 30%|███       | 18500/60850 [1:30:10<3:17:33,  3.57it/s] 

{'loss': 0.0022, 'grad_norm': 0.04583275318145752, 'learning_rate': 5.000000000000001e-07, 'epoch': 15.2}


 31%|███       | 19000/60850 [1:32:30<3:15:10,  3.57it/s]

{'loss': 0.0012, 'grad_norm': 0.03589851036667824, 'learning_rate': 5.000000000000001e-07, 'epoch': 15.61}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.638202428817749, 'eval_accuracy': 0.8852296508426039, 'eval_f1': 0.6232827187274042, 'eval_precision': 0.6315018315018315, 'eval_recall': 0.6152748037116346, 'eval_runtime': 14.0647, 'eval_samples_per_second': 92.217, 'eval_steps_per_second': 11.589, 'epoch': 16.0}


 32%|███▏      | 19500/60850 [1:35:06<3:13:11,  3.57it/s] 

{'loss': 0.0013, 'grad_norm': 0.01646898128092289, 'learning_rate': 5.000000000000001e-07, 'epoch': 16.02}


 33%|███▎      | 20000/60850 [1:37:26<3:10:52,  3.57it/s]

{'loss': 0.0014, 'grad_norm': 0.011469099670648575, 'learning_rate': 5.000000000000001e-07, 'epoch': 16.43}


 34%|███▎      | 20500/60850 [1:39:46<3:08:10,  3.57it/s]

{'loss': 0.0013, 'grad_norm': 0.009099145419895649, 'learning_rate': 5.000000000000001e-07, 'epoch': 16.84}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6413565278053284, 'eval_accuracy': 0.8845687851084921, 'eval_f1': 0.6219336219336219, 'eval_precision': 0.6287381473377097, 'eval_recall': 0.6152748037116346, 'eval_runtime': 14.0543, 'eval_samples_per_second': 92.285, 'eval_steps_per_second': 11.598, 'epoch': 17.0}


 35%|███▍      | 21000/60850 [1:42:21<3:05:49,  3.57it/s] 

{'loss': 0.0013, 'grad_norm': 0.02742249332368374, 'learning_rate': 5.000000000000001e-07, 'epoch': 17.26}


 35%|███▌      | 21500/60850 [1:44:41<3:03:41,  3.57it/s]

{'loss': 0.0015, 'grad_norm': 0.019483599811792374, 'learning_rate': 5.000000000000001e-07, 'epoch': 17.67}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6444230079650879, 'eval_accuracy': 0.8853397951316224, 'eval_f1': 0.6243233489714904, 'eval_precision': 0.6313868613138686, 'eval_recall': 0.6174161313347609, 'eval_runtime': 14.0693, 'eval_samples_per_second': 92.186, 'eval_steps_per_second': 11.585, 'epoch': 18.0}


 36%|███▌      | 22000/60850 [1:47:16<3:01:05,  3.58it/s] 

{'loss': 0.001, 'grad_norm': 0.35477215051651, 'learning_rate': 5.000000000000001e-07, 'epoch': 18.08}


 37%|███▋      | 22500/60850 [1:49:36<2:58:48,  3.57it/s]

{'loss': 0.0014, 'grad_norm': 0.04360060766339302, 'learning_rate': 5.000000000000001e-07, 'epoch': 18.49}


 38%|███▊      | 23000/60850 [1:51:56<2:56:51,  3.57it/s]

{'loss': 0.0012, 'grad_norm': 0.020765241235494614, 'learning_rate': 5.000000000000001e-07, 'epoch': 18.9}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6476017236709595, 'eval_accuracy': 0.8842383522414363, 'eval_f1': 0.6220783890686803, 'eval_precision': 0.6268115942028986, 'eval_recall': 0.6174161313347609, 'eval_runtime': 14.0523, 'eval_samples_per_second': 92.298, 'eval_steps_per_second': 11.599, 'epoch': 19.0}


 39%|███▊      | 23500/60850 [1:54:31<2:54:07,  3.58it/s] 

{'loss': 0.0012, 'grad_norm': 0.039727166295051575, 'learning_rate': 5.000000000000001e-08, 'epoch': 19.31}


 39%|███▉      | 24000/60850 [1:56:51<2:51:48,  3.57it/s]

{'loss': 0.0009, 'grad_norm': 0.0226200632750988, 'learning_rate': 5.000000000000001e-08, 'epoch': 19.72}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6478744745254517, 'eval_accuracy': 0.8843484965304549, 'eval_f1': 0.6223021582733813, 'eval_precision': 0.6272661348803481, 'eval_recall': 0.6174161313347609, 'eval_runtime': 14.0669, 'eval_samples_per_second': 92.203, 'eval_steps_per_second': 11.588, 'epoch': 20.0}


 40%|████      | 24500/60850 [1:59:27<2:49:39,  3.57it/s] 

{'loss': 0.001, 'grad_norm': 0.00831972248852253, 'learning_rate': 5.000000000000001e-08, 'epoch': 20.13}


 41%|████      | 25000/60850 [2:01:47<2:46:58,  3.58it/s]

{'loss': 0.0013, 'grad_norm': 0.13959477841854095, 'learning_rate': 5.000000000000001e-08, 'epoch': 20.54}


 42%|████▏     | 25500/60850 [2:04:07<2:44:45,  3.58it/s]

{'loss': 0.0009, 'grad_norm': 0.07023140043020248, 'learning_rate': 5.000000000000001e-08, 'epoch': 20.95}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6482683420181274, 'eval_accuracy': 0.8851195065535852, 'eval_f1': 0.6241441441441441, 'eval_precision': 0.6302765647743813, 'eval_recall': 0.6181299072091363, 'eval_runtime': 14.0557, 'eval_samples_per_second': 92.276, 'eval_steps_per_second': 11.597, 'epoch': 21.0}


 43%|████▎     | 26000/60850 [2:06:42<2:42:40,  3.57it/s] 

{'loss': 0.0011, 'grad_norm': 0.012176849879324436, 'learning_rate': 5.000000000000001e-08, 'epoch': 21.36}


 44%|████▎     | 26500/60850 [2:09:02<2:40:27,  3.57it/s]

{'loss': 0.001, 'grad_norm': 0.01256045326590538, 'learning_rate': 5.000000000000001e-08, 'epoch': 21.77}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6486040353775024, 'eval_accuracy': 0.885449939420641, 'eval_f1': 0.6253602305475504, 'eval_precision': 0.6312727272727273, 'eval_recall': 0.6195574589578873, 'eval_runtime': 14.0437, 'eval_samples_per_second': 92.354, 'eval_steps_per_second': 11.607, 'epoch': 22.0}


 44%|████▍     | 27000/60850 [2:11:37<2:37:51,  3.57it/s] 

{'loss': 0.001, 'grad_norm': 0.010223963297903538, 'learning_rate': 5.000000000000001e-08, 'epoch': 22.19}


 45%|████▌     | 27500/60850 [2:13:57<2:35:31,  3.57it/s]

{'loss': 0.0012, 'grad_norm': 0.0031769757624715567, 'learning_rate': 5.000000000000001e-08, 'epoch': 22.6}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6486660838127136, 'eval_accuracy': 0.8855600837096597, 'eval_f1': 0.6255855855855856, 'eval_precision': 0.6317321688500728, 'eval_recall': 0.6195574589578873, 'eval_runtime': 14.0522, 'eval_samples_per_second': 92.299, 'eval_steps_per_second': 11.6, 'epoch': 23.0}


 46%|████▌     | 28000/60850 [2:16:32<4:58:57,  1.83it/s] 

{'loss': 0.0012, 'grad_norm': 0.011628181673586369, 'learning_rate': 5.000000000000001e-08, 'epoch': 23.01}


 47%|████▋     | 28500/60850 [2:18:52<2:30:48,  3.58it/s]

{'loss': 0.001, 'grad_norm': 0.3653445243835449, 'learning_rate': 5.000000000000001e-08, 'epoch': 23.42}


 48%|████▊     | 29000/60850 [2:21:12<2:28:57,  3.56it/s]

{'loss': 0.001, 'grad_norm': 0.016668319702148438, 'learning_rate': 5.000000000000001e-08, 'epoch': 23.83}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6490308046340942, 'eval_accuracy': 0.8855600837096597, 'eval_f1': 0.6255855855855856, 'eval_precision': 0.6317321688500728, 'eval_recall': 0.6195574589578873, 'eval_runtime': 14.0467, 'eval_samples_per_second': 92.335, 'eval_steps_per_second': 11.604, 'epoch': 24.0}


 48%|████▊     | 29500/60850 [2:23:48<2:26:36,  3.56it/s] 

{'loss': 0.001, 'grad_norm': 0.00929506216198206, 'learning_rate': 5.000000000000001e-08, 'epoch': 24.24}


 49%|████▉     | 30000/60850 [2:26:08<2:23:49,  3.57it/s]

{'loss': 0.0011, 'grad_norm': 0.040915604680776596, 'learning_rate': 5.000000000000001e-08, 'epoch': 24.65}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6491663455963135, 'eval_accuracy': 0.8855600837096597, 'eval_f1': 0.6253155427335017, 'eval_precision': 0.6319241982507289, 'eval_recall': 0.6188436830835118, 'eval_runtime': 14.0582, 'eval_samples_per_second': 92.259, 'eval_steps_per_second': 11.595, 'epoch': 25.0}


 50%|█████     | 30500/60850 [2:28:43<2:21:36,  3.57it/s] 

{'loss': 0.0012, 'grad_norm': 0.06308461725711823, 'learning_rate': 5.000000000000002e-09, 'epoch': 25.06}


 51%|█████     | 31000/60850 [2:31:03<2:19:02,  3.58it/s]

{'loss': 0.0012, 'grad_norm': 0.00549336289986968, 'learning_rate': 5.000000000000002e-09, 'epoch': 25.47}


 52%|█████▏    | 31500/60850 [2:33:23<2:16:52,  3.57it/s]

{'loss': 0.0009, 'grad_norm': 0.003279942087829113, 'learning_rate': 5.000000000000002e-09, 'epoch': 25.88}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6492221355438232, 'eval_accuracy': 0.8855600837096597, 'eval_f1': 0.6253155427335017, 'eval_precision': 0.6319241982507289, 'eval_recall': 0.6188436830835118, 'eval_runtime': 14.0523, 'eval_samples_per_second': 92.298, 'eval_steps_per_second': 11.6, 'epoch': 26.0}


 53%|█████▎    | 32000/60850 [2:35:58<2:14:26,  3.58it/s] 

{'loss': 0.001, 'grad_norm': 0.007808650843799114, 'learning_rate': 5.000000000000002e-09, 'epoch': 26.29}


 53%|█████▎    | 32500/60850 [2:38:18<2:12:15,  3.57it/s]

{'loss': 0.001, 'grad_norm': 0.09849510341882706, 'learning_rate': 5.000000000000002e-09, 'epoch': 26.71}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.649262011051178, 'eval_accuracy': 0.8855600837096597, 'eval_f1': 0.6253155427335017, 'eval_precision': 0.6319241982507289, 'eval_recall': 0.6188436830835118, 'eval_runtime': 14.0513, 'eval_samples_per_second': 92.305, 'eval_steps_per_second': 11.6, 'epoch': 27.0}


 54%|█████▍    | 33000/60850 [2:40:53<2:09:51,  3.57it/s] 

{'loss': 0.0012, 'grad_norm': 0.014730062335729599, 'learning_rate': 5.000000000000002e-09, 'epoch': 27.12}


 55%|█████▌    | 33500/60850 [2:43:13<2:07:36,  3.57it/s]

{'loss': 0.0009, 'grad_norm': 0.089332215487957, 'learning_rate': 5.000000000000002e-09, 'epoch': 27.53}


 56%|█████▌    | 34000/60850 [2:45:33<2:05:09,  3.58it/s]

{'loss': 0.0011, 'grad_norm': 0.009167611598968506, 'learning_rate': 5.000000000000002e-09, 'epoch': 27.94}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.649284839630127, 'eval_accuracy': 0.8855600837096597, 'eval_f1': 0.6253155427335017, 'eval_precision': 0.6319241982507289, 'eval_recall': 0.6188436830835118, 'eval_runtime': 14.047, 'eval_samples_per_second': 92.333, 'eval_steps_per_second': 11.604, 'epoch': 28.0}


 57%|█████▋    | 34500/60850 [2:48:08<2:02:57,  3.57it/s] 

{'loss': 0.0013, 'grad_norm': 0.01488443836569786, 'learning_rate': 5.000000000000002e-09, 'epoch': 28.35}


 58%|█████▊    | 35000/60850 [2:50:28<2:00:43,  3.57it/s]

{'loss': 0.0008, 'grad_norm': 0.1749149113893509, 'learning_rate': 5.000000000000002e-09, 'epoch': 28.76}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.64931720495224, 'eval_accuracy': 0.8855600837096597, 'eval_f1': 0.6253155427335017, 'eval_precision': 0.6319241982507289, 'eval_recall': 0.6188436830835118, 'eval_runtime': 14.05, 'eval_samples_per_second': 92.313, 'eval_steps_per_second': 11.601, 'epoch': 29.0}


 58%|█████▊    | 35500/60850 [2:53:04<1:58:14,  3.57it/s] 

{'loss': 0.0012, 'grad_norm': 0.04326556622982025, 'learning_rate': 5.000000000000002e-09, 'epoch': 29.17}


 59%|█████▉    | 36000/60850 [2:55:24<1:55:48,  3.58it/s]

{'loss': 0.0014, 'grad_norm': 0.014864982105791569, 'learning_rate': 5.000000000000002e-09, 'epoch': 29.58}


 60%|█████▉    | 36500/60850 [2:57:44<1:53:59,  3.56it/s]

{'loss': 0.001, 'grad_norm': 0.2622350752353668, 'learning_rate': 5.000000000000002e-09, 'epoch': 29.99}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6493499875068665, 'eval_accuracy': 0.8855600837096597, 'eval_f1': 0.6253155427335017, 'eval_precision': 0.6319241982507289, 'eval_recall': 0.6188436830835118, 'eval_runtime': 14.1864, 'eval_samples_per_second': 91.425, 'eval_steps_per_second': 11.49, 'epoch': 30.0}


 61%|██████    | 37000/60850 [3:00:21<1:51:57,  3.55it/s] 

{'loss': 0.0012, 'grad_norm': 0.004503257572650909, 'learning_rate': 5.000000000000002e-09, 'epoch': 30.4}


 62%|██████▏   | 37500/60850 [3:02:41<1:49:15,  3.56it/s]

{'loss': 0.001, 'grad_norm': 0.037643659859895706, 'learning_rate': 5.000000000000002e-09, 'epoch': 30.81}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6493815779685974, 'eval_accuracy': 0.8855600837096597, 'eval_f1': 0.6253155427335017, 'eval_precision': 0.6319241982507289, 'eval_recall': 0.6188436830835118, 'eval_runtime': 14.2829, 'eval_samples_per_second': 90.808, 'eval_steps_per_second': 11.412, 'epoch': 31.0}


 62%|██████▏   | 38000/60850 [3:05:19<1:47:09,  3.55it/s] 

{'loss': 0.0012, 'grad_norm': 0.01565368101000786, 'learning_rate': 5.000000000000002e-09, 'epoch': 31.22}


 63%|██████▎   | 38500/60850 [3:07:39<1:44:16,  3.57it/s]

{'loss': 0.001, 'grad_norm': 0.5900906324386597, 'learning_rate': 5.000000000000002e-09, 'epoch': 31.64}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6494014263153076, 'eval_accuracy': 0.8855600837096597, 'eval_f1': 0.6253155427335017, 'eval_precision': 0.6319241982507289, 'eval_recall': 0.6188436830835118, 'eval_runtime': 14.0585, 'eval_samples_per_second': 92.258, 'eval_steps_per_second': 11.594, 'epoch': 32.0}


 64%|██████▍   | 39000/60850 [3:10:14<1:41:45,  3.58it/s] 

{'loss': 0.0012, 'grad_norm': 0.0036233174614608288, 'learning_rate': 5.000000000000002e-09, 'epoch': 32.05}


 65%|██████▍   | 39500/60850 [3:12:34<1:39:36,  3.57it/s]

{'loss': 0.0012, 'grad_norm': 0.037546612322330475, 'learning_rate': 5.000000000000002e-09, 'epoch': 32.46}


 66%|██████▌   | 40000/60850 [3:14:54<1:37:06,  3.58it/s]

{'loss': 0.0009, 'grad_norm': 0.01845630444586277, 'learning_rate': 5.000000000000002e-09, 'epoch': 32.87}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6494268774986267, 'eval_accuracy': 0.885449939420641, 'eval_f1': 0.6248196248196248, 'eval_precision': 0.6316557257476295, 'eval_recall': 0.6181299072091363, 'eval_runtime': 14.0577, 'eval_samples_per_second': 92.263, 'eval_steps_per_second': 11.595, 'epoch': 33.0}


 67%|██████▋   | 40500/60850 [3:17:30<1:34:49,  3.58it/s] 

{'loss': 0.0012, 'grad_norm': 0.043023236095905304, 'learning_rate': 5.000000000000002e-09, 'epoch': 33.28}


 67%|██████▋   | 41000/60850 [3:19:50<1:32:43,  3.57it/s]

{'loss': 0.001, 'grad_norm': 0.017633002251386642, 'learning_rate': 5.000000000000002e-09, 'epoch': 33.69}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.649466872215271, 'eval_accuracy': 0.885449939420641, 'eval_f1': 0.6248196248196248, 'eval_precision': 0.6316557257476295, 'eval_recall': 0.6181299072091363, 'eval_runtime': 14.0471, 'eval_samples_per_second': 92.332, 'eval_steps_per_second': 11.604, 'epoch': 34.0}


 68%|██████▊   | 41500/60850 [3:22:25<1:30:13,  3.57it/s] 

{'loss': 0.0009, 'grad_norm': 0.010677078738808632, 'learning_rate': 5.000000000000002e-09, 'epoch': 34.1}


 69%|██████▉   | 42000/60850 [3:24:45<1:27:56,  3.57it/s]

{'loss': 0.0013, 'grad_norm': 0.08315624296665192, 'learning_rate': 5.000000000000002e-09, 'epoch': 34.51}


 70%|██████▉   | 42500/60850 [3:27:05<1:25:32,  3.58it/s]

{'loss': 0.001, 'grad_norm': 0.0076479376293718815, 'learning_rate': 5.000000000000002e-09, 'epoch': 34.92}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6494995355606079, 'eval_accuracy': 0.885449939420641, 'eval_f1': 0.6248196248196248, 'eval_precision': 0.6316557257476295, 'eval_recall': 0.6181299072091363, 'eval_runtime': 14.0627, 'eval_samples_per_second': 92.23, 'eval_steps_per_second': 11.591, 'epoch': 35.0}


 71%|███████   | 43000/60850 [3:29:40<1:23:12,  3.58it/s] 

{'loss': 0.0009, 'grad_norm': 0.008798464201390743, 'learning_rate': 5.000000000000002e-09, 'epoch': 35.33}


 71%|███████▏  | 43500/60850 [3:32:00<1:20:48,  3.58it/s]

{'loss': 0.0011, 'grad_norm': 0.02672661654651165, 'learning_rate': 5.000000000000002e-09, 'epoch': 35.74}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6495476961135864, 'eval_accuracy': 0.8855600837096597, 'eval_f1': 0.6253155427335017, 'eval_precision': 0.6319241982507289, 'eval_recall': 0.6188436830835118, 'eval_runtime': 14.0585, 'eval_samples_per_second': 92.258, 'eval_steps_per_second': 11.594, 'epoch': 36.0}


 72%|███████▏  | 44000/60850 [3:34:35<1:18:37,  3.57it/s] 

{'loss': 0.0012, 'grad_norm': 0.6434280276298523, 'learning_rate': 5.000000000000002e-09, 'epoch': 36.15}


 73%|███████▎  | 44500/60850 [3:36:55<1:16:15,  3.57it/s]

{'loss': 0.001, 'grad_norm': 0.009497740305960178, 'learning_rate': 5.000000000000002e-09, 'epoch': 36.57}


 74%|███████▍  | 45000/60850 [3:39:15<1:14:02,  3.57it/s]

{'loss': 0.0011, 'grad_norm': 0.00547595601528883, 'learning_rate': 5.000000000000002e-09, 'epoch': 36.98}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6495803594589233, 'eval_accuracy': 0.885449939420641, 'eval_f1': 0.6248196248196248, 'eval_precision': 0.6316557257476295, 'eval_recall': 0.6181299072091363, 'eval_runtime': 14.0688, 'eval_samples_per_second': 92.19, 'eval_steps_per_second': 11.586, 'epoch': 37.0}


 75%|███████▍  | 45500/60850 [3:41:51<1:11:32,  3.58it/s] 

{'loss': 0.0009, 'grad_norm': 0.08420131355524063, 'learning_rate': 5.000000000000002e-09, 'epoch': 37.39}


 76%|███████▌  | 46000/60850 [3:44:11<1:09:18,  3.57it/s]

{'loss': 0.001, 'grad_norm': 0.15515948832035065, 'learning_rate': 5.000000000000002e-09, 'epoch': 37.8}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6496063470840454, 'eval_accuracy': 0.8855600837096597, 'eval_f1': 0.6253155427335017, 'eval_precision': 0.6319241982507289, 'eval_recall': 0.6188436830835118, 'eval_runtime': 14.0513, 'eval_samples_per_second': 92.304, 'eval_steps_per_second': 11.6, 'epoch': 38.0}


 76%|███████▋  | 46500/60850 [3:46:46<1:07:00,  3.57it/s] 

{'loss': 0.0013, 'grad_norm': 0.007742048241198063, 'learning_rate': 5.000000000000002e-09, 'epoch': 38.21}


 77%|███████▋  | 47000/60850 [3:49:06<1:04:39,  3.57it/s]

{'loss': 0.001, 'grad_norm': 0.004102612379938364, 'learning_rate': 5.000000000000002e-09, 'epoch': 38.62}


                                                         
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6496395468711853, 'eval_accuracy': 0.885449939420641, 'eval_f1': 0.6248196248196248, 'eval_precision': 0.6316557257476295, 'eval_recall': 0.6181299072091363, 'eval_runtime': 14.052, 'eval_samples_per_second': 92.3, 'eval_steps_per_second': 11.6, 'epoch': 39.0}


 78%|███████▊  | 47500/60850 [3:51:41<1:02:14,  3.58it/s] 

{'loss': 0.0011, 'grad_norm': 0.014056633226573467, 'learning_rate': 5.000000000000002e-09, 'epoch': 39.03}


 79%|███████▉  | 48000/60850 [3:54:01<59:55,  3.57it/s]  

{'loss': 0.0009, 'grad_norm': 0.06077985465526581, 'learning_rate': 5.000000000000002e-09, 'epoch': 39.44}


 80%|███████▉  | 48500/60850 [3:56:21<57:34,  3.57it/s]  

{'loss': 0.0013, 'grad_norm': 0.08564631640911102, 'learning_rate': 5.000000000000002e-09, 'epoch': 39.85}


                                                       
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6496725082397461, 'eval_accuracy': 0.8856702279986782, 'eval_f1': 0.6255411255411255, 'eval_precision': 0.6323851203501094, 'eval_recall': 0.6188436830835118, 'eval_runtime': 14.06, 'eval_samples_per_second': 92.247, 'eval_steps_per_second': 11.593, 'epoch': 40.0}


 81%|████████  | 49000/60850 [3:58:57<55:13,  3.58it/s]   

{'loss': 0.001, 'grad_norm': 0.007292252033948898, 'learning_rate': 5.000000000000002e-09, 'epoch': 40.26}


 81%|████████▏ | 49500/60850 [4:01:16<52:57,  3.57it/s]

{'loss': 0.0011, 'grad_norm': 0.007662144023925066, 'learning_rate': 5.000000000000002e-09, 'epoch': 40.67}


                                                       
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6497180461883545, 'eval_accuracy': 0.8856702279986782, 'eval_f1': 0.6255411255411255, 'eval_precision': 0.6323851203501094, 'eval_recall': 0.6188436830835118, 'eval_runtime': 14.0699, 'eval_samples_per_second': 92.183, 'eval_steps_per_second': 11.585, 'epoch': 41.0}


 82%|████████▏ | 50000/60850 [4:03:52<50:42,  3.57it/s]   

{'loss': 0.0009, 'grad_norm': 0.008081360720098019, 'learning_rate': 5.000000000000002e-09, 'epoch': 41.08}


 83%|████████▎ | 50500/60850 [4:06:12<48:17,  3.57it/s]

{'loss': 0.0009, 'grad_norm': 0.007793093100190163, 'learning_rate': 5.000000000000002e-09, 'epoch': 41.5}


 84%|████████▍ | 51000/60850 [4:08:32<46:01,  3.57it/s]

{'loss': 0.0012, 'grad_norm': 0.04658627510070801, 'learning_rate': 5.000000000000002e-09, 'epoch': 41.91}


                                                       
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6497518420219421, 'eval_accuracy': 0.8856702279986782, 'eval_f1': 0.6258111031002163, 'eval_precision': 0.6321922796795338, 'eval_recall': 0.6195574589578873, 'eval_runtime': 14.0807, 'eval_samples_per_second': 92.112, 'eval_steps_per_second': 11.576, 'epoch': 42.0}


 85%|████████▍ | 51500/60850 [4:11:07<43:37,  3.57it/s]   

{'loss': 0.0011, 'grad_norm': 0.0067849489860236645, 'learning_rate': 5.000000000000002e-09, 'epoch': 42.32}


 85%|████████▌ | 52000/60850 [4:13:27<41:19,  3.57it/s]

{'loss': 0.0009, 'grad_norm': 0.008951003663241863, 'learning_rate': 5.000000000000002e-09, 'epoch': 42.73}


                                                       
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6497794389724731, 'eval_accuracy': 0.885449939420641, 'eval_f1': 0.6250901225666907, 'eval_precision': 0.6314639475600874, 'eval_recall': 0.6188436830835118, 'eval_runtime': 14.0649, 'eval_samples_per_second': 92.216, 'eval_steps_per_second': 11.589, 'epoch': 43.0}


 86%|████████▋ | 52500/60850 [4:16:03<38:58,  3.57it/s]   

{'loss': 0.0013, 'grad_norm': 0.16517548263072968, 'learning_rate': 5.000000000000002e-09, 'epoch': 43.14}


 87%|████████▋ | 53000/60850 [4:18:23<36:35,  3.58it/s]

{'loss': 0.001, 'grad_norm': 0.01276401150971651, 'learning_rate': 5.000000000000002e-09, 'epoch': 43.55}


 88%|████████▊ | 53500/60850 [4:20:43<34:15,  3.58it/s]

{'loss': 0.001, 'grad_norm': 0.005910345818847418, 'learning_rate': 5.000000000000002e-09, 'epoch': 43.96}


                                                       
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6497839093208313, 'eval_accuracy': 0.8856702279986782, 'eval_f1': 0.6252707581227437, 'eval_precision': 0.6325785244704164, 'eval_recall': 0.6181299072091363, 'eval_runtime': 14.0943, 'eval_samples_per_second': 92.023, 'eval_steps_per_second': 11.565, 'epoch': 44.0}


 89%|████████▊ | 54000/60850 [4:23:18<31:57,  3.57it/s]  

{'loss': 0.0008, 'grad_norm': 0.0562865175306797, 'learning_rate': 5.000000000000002e-09, 'epoch': 44.37}


 90%|████████▉ | 54500/60850 [4:25:38<29:35,  3.58it/s]

{'loss': 0.0012, 'grad_norm': 0.01589040830731392, 'learning_rate': 5.000000000000002e-09, 'epoch': 44.78}


                                                       
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.649806022644043, 'eval_accuracy': 0.8856702279986782, 'eval_f1': 0.6252707581227437, 'eval_precision': 0.6325785244704164, 'eval_recall': 0.6181299072091363, 'eval_runtime': 14.0646, 'eval_samples_per_second': 92.217, 'eval_steps_per_second': 11.589, 'epoch': 45.0}


 90%|█████████ | 55000/60850 [4:28:14<27:17,  3.57it/s]  

{'loss': 0.0009, 'grad_norm': 0.059204667806625366, 'learning_rate': 5.000000000000002e-09, 'epoch': 45.19}


 91%|█████████ | 55500/60850 [4:30:34<25:00,  3.57it/s]

{'loss': 0.001, 'grad_norm': 0.011592683382332325, 'learning_rate': 5.000000000000002e-09, 'epoch': 45.6}


                                                       
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6498254537582397, 'eval_accuracy': 0.8855600837096597, 'eval_f1': 0.6250451100685673, 'eval_precision': 0.6321167883211679, 'eval_recall': 0.6181299072091363, 'eval_runtime': 14.0985, 'eval_samples_per_second': 91.996, 'eval_steps_per_second': 11.562, 'epoch': 46.0}


 92%|█████████▏| 56000/60850 [4:33:09<23:31,  3.44it/s]  

{'loss': 0.0013, 'grad_norm': 0.012492149136960506, 'learning_rate': 5.000000000000002e-09, 'epoch': 46.01}


 93%|█████████▎| 56500/60850 [4:35:29<20:17,  3.57it/s]

{'loss': 0.0012, 'grad_norm': 0.042714234441518784, 'learning_rate': 5.000000000000002e-09, 'epoch': 46.43}


 94%|█████████▎| 57000/60850 [4:37:49<17:59,  3.57it/s]

{'loss': 0.001, 'grad_norm': 0.7834381461143494, 'learning_rate': 5.000000000000002e-09, 'epoch': 46.84}


                                                       
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6498615145683289, 'eval_accuracy': 0.8855600837096597, 'eval_f1': 0.6250451100685673, 'eval_precision': 0.6321167883211679, 'eval_recall': 0.6181299072091363, 'eval_runtime': 14.0804, 'eval_samples_per_second': 92.114, 'eval_steps_per_second': 11.576, 'epoch': 47.0}


 94%|█████████▍| 57500/60850 [4:40:24<15:38,  3.57it/s]  

{'loss': 0.0008, 'grad_norm': 0.014106919057667255, 'learning_rate': 5.000000000000002e-09, 'epoch': 47.25}


 95%|█████████▌| 58000/60850 [4:42:44<13:18,  3.57it/s]

{'loss': 0.0009, 'grad_norm': 0.39534083008766174, 'learning_rate': 5.000000000000002e-09, 'epoch': 47.66}


                                                       
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6498920321464539, 'eval_accuracy': 0.885449939420641, 'eval_f1': 0.6248196248196248, 'eval_precision': 0.6316557257476295, 'eval_recall': 0.6181299072091363, 'eval_runtime': 14.2461, 'eval_samples_per_second': 91.043, 'eval_steps_per_second': 11.442, 'epoch': 48.0}


 96%|█████████▌| 58500/60850 [4:45:21<11:01,  3.55it/s]  

{'loss': 0.0015, 'grad_norm': 0.012698248028755188, 'learning_rate': 5.000000000000002e-09, 'epoch': 48.07}


 97%|█████████▋| 59000/60850 [4:47:42<08:42,  3.54it/s]

{'loss': 0.001, 'grad_norm': 0.014696278609335423, 'learning_rate': 5.000000000000002e-09, 'epoch': 48.48}


 98%|█████████▊| 59500/60850 [4:50:03<06:17,  3.57it/s]

{'loss': 0.001, 'grad_norm': 0.005140350200235844, 'learning_rate': 5.000000000000002e-09, 'epoch': 48.89}


                                                       
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.6499161720275879, 'eval_accuracy': 0.885449939420641, 'eval_f1': 0.6248196248196248, 'eval_precision': 0.6316557257476295, 'eval_recall': 0.6181299072091363, 'eval_runtime': 14.1568, 'eval_samples_per_second': 91.617, 'eval_steps_per_second': 11.514, 'epoch': 49.0}


 99%|█████████▊| 60000/60850 [4:52:39<03:57,  3.57it/s]  

{'loss': 0.0009, 'grad_norm': 0.03327944129705429, 'learning_rate': 5.000000000000002e-09, 'epoch': 49.3}


 99%|█████████▉| 60500/60850 [4:54:59<01:38,  3.57it/s]

{'loss': 0.0014, 'grad_norm': 0.004152658395469189, 'learning_rate': 5.000000000000002e-09, 'epoch': 49.71}


Non-default generation parameters: {'max_length': 512}
                                                       
Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.649949848651886, 'eval_accuracy': 0.885449939420641, 'eval_f1': 0.6248196248196248, 'eval_precision': 0.6316557257476295, 'eval_recall': 0.6181299072091363, 'eval_runtime': 14.0229, 'eval_samples_per_second': 92.491, 'eval_steps_per_second': 11.624, 'epoch': 50.0}


100%|██████████| 60850/60850 [4:56:55<00:00,  3.42it/s]

{'train_runtime': 17815.4471, 'train_samples_per_second': 27.313, 'train_steps_per_second': 3.416, 'train_loss': 0.01882769005310369, 'epoch': 50.0}





TrainOutput(global_step=60850, training_loss=0.01882769005310369, metrics={'train_runtime': 17815.4471, 'train_samples_per_second': 27.313, 'train_steps_per_second': 3.416, 'total_flos': 8.4771947234304e+16, 'train_loss': 0.01882769005310369, 'epoch': 50.0})

In [8]:
# Persist the fine-tuned artifacts side by side in one directory so that the
# pair can later be reloaded together from the same path.
# Order matters only for readability of the output: tokenizer first, then model.
for artifact in (tokenizer, model):
    artifact.save_pretrained("./first_finetuning_model")

Non-default generation parameters: {'max_length': 512}
