## Load Data and Tokenizer

In [1]:
from transformers import BertTokenizer
from datasets import load_dataset

ds = load_dataset('billingsmoore/tagged-tibetan-to-english-translation-dataset')

tokenizer = BertTokenizer.from_pretrained('tibetan_tokenizer')

## Preprocess Data

### Use just first two tags

In [2]:
def just_two_tags(examples):
    tags = [tag[:2] for tag in examples['Tags']]
    examples['Tags'] = tags
    return examples

ds = ds.map(just_two_tags, batched=True)

In [3]:
from sklearn.preprocessing import MultiLabelBinarizer


mlb = MultiLabelBinarizer()
labels = mlb.fit(ds['train']['Tags'])  # Fit all unique Tags

# Save label mappings
import json
with open("label_mapping.json", "w") as f:
    json.dump(mlb.classes_.tolist(), f)


In [4]:
def preprocess(examples):
    tokens = tokenizer(examples["Tibetan"], padding="max_length", truncation=True, max_length=128)
    tokens["labels"] =  mlb.transform(examples['Tags']).astype(float).tolist() # Convert labels to multi-hot
    return tokens

encoded_dataset = ds.map(preprocess, batched=True)


In [5]:
encoded_dataset = encoded_dataset.remove_columns(['Tibetan', 'Phonetic', 'English', 'Tags'])

In [6]:
encoded_dataset = encoded_dataset['train'].train_test_split(.15)

In [7]:
enc = tokenizer(ds['train'][0]['Tibetan'])
tokenizer.decode(enc.input_ids)

'[CLS] བ ་ མ ་ དང ་ ལག ་ པའ ་ ལ ་ ལ ་ ཕག ་ འཚལ ་ ལ ། ། [SEP]'

In [8]:
ds['train'][0]

{'Tibetan': 'བླ་མ་དང་ལྷག་པའི་ལྷ་ལ་ཕྱག་འཚལ་ལོ།།',
 'Phonetic': 'lama dang lhakpé lha la chaktsal lo',
 'English': 'Homage to the guru and supreme deity.',
 'Tags': ['Abhidharma', 'Praise']}

## Train Model

In [9]:
from transformers import BertTokenizer, BertForSequenceClassification

# Load tokenizer and model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(mlb.classes_))

# Resize embeddings to match the new tokenizer
model.resize_token_embeddings(len(tokenizer))

# Move model to GPU
model = model.to('cuda:0')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(eval_pred):
    predictions, references = eval_pred
    
    # Apply a threshold to convert logits to binary predictions
    predictions = (predictions > 0.5).astype(int)
    
    # Compute metrics
    accuracy = accuracy_score(references, predictions)
    f1 = f1_score(references, predictions, average="micro")
    precision = precision_score(references, predictions, average="micro")
    recall = recall_score(references, predictions, average="micro")

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision_1": precision,
        "recall": recall,
    }


In [11]:
from transformers import TrainingArguments, Trainer

from transformers import EarlyStoppingCallback

# Define training arguments
training_args = TrainingArguments(
    output_dir="bert-classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=100,  # Set a maximum number of epochs
    weight_decay=0.01,
    eval_strategy="epoch",  # Evaluate at the end of every epoch
    save_strategy="epoch",  # Save the model at the end of every epoch
    load_best_model_at_end=True,  # Load the best model after training
    metric_for_best_model="accuracy",  # Metric to monitor
    greater_is_better=True,  # Higher accuracy is better
    logging_dir="./logs"
)

# Add the EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3  # Stop training if the metric does not improve for 3 evaluation steps
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]  # Add the early stopping callback
)

# Start training
trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbillingsmoore[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/571300 [00:00<?, ?it/s]

{'loss': 0.1586, 'grad_norm': 0.06657019257545471, 'learning_rate': 1.9982496061613864e-05, 'epoch': 0.09}
{'loss': 0.0555, 'grad_norm': 0.06579519808292389, 'learning_rate': 1.996499212322773e-05, 'epoch': 0.18}
{'loss': 0.0526, 'grad_norm': 0.0541217103600502, 'learning_rate': 1.994748818484159e-05, 'epoch': 0.26}
{'loss': 0.0522, 'grad_norm': 0.05693612992763519, 'learning_rate': 1.9929984246455454e-05, 'epoch': 0.35}
{'loss': 0.0519, 'grad_norm': 0.06708614528179169, 'learning_rate': 1.9912480308069317e-05, 'epoch': 0.44}
{'loss': 0.0516, 'grad_norm': 0.0570235550403595, 'learning_rate': 1.9894976369683182e-05, 'epoch': 0.53}
{'loss': 0.0517, 'grad_norm': 0.05111713334918022, 'learning_rate': 1.987747243129704e-05, 'epoch': 0.61}
{'loss': 0.0515, 'grad_norm': 0.054363153874874115, 'learning_rate': 1.9859968492910907e-05, 'epoch': 0.7}
{'loss': 0.0516, 'grad_norm': 0.048229996114969254, 'learning_rate': 1.984246455452477e-05, 'epoch': 0.79}
{'loss': 0.0515, 'grad_norm': 0.0492477044

  0%|          | 0/1009 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.05055883154273033, 'eval_accuracy': 0.0, 'eval_f1': 0.0, 'eval_precision_1': 0.0, 'eval_recall': 0.0, 'eval_runtime': 40.2433, 'eval_samples_per_second': 400.787, 'eval_steps_per_second': 25.072, 'epoch': 1.0}
{'loss': 0.0506, 'grad_norm': 0.048681192100048065, 'learning_rate': 1.978995273936636e-05, 'epoch': 1.05}
{'loss': 0.0504, 'grad_norm': 0.04225074127316475, 'learning_rate': 1.9772448800980222e-05, 'epoch': 1.14}
{'loss': 0.05, 'grad_norm': 0.0530262254178524, 'learning_rate': 1.9754944862594084e-05, 'epoch': 1.23}
{'loss': 0.05, 'grad_norm': 0.053983863443136215, 'learning_rate': 1.973744092420795e-05, 'epoch': 1.31}
{'loss': 0.0496, 'grad_norm': 0.2139396369457245, 'learning_rate': 1.9719936985821812e-05, 'epoch': 1.4}
{'loss': 0.0494, 'grad_norm': 0.14534181356430054, 'learning_rate': 1.9702433047435675e-05, 'epoch': 1.49}
{'loss': 0.0484, 'grad_norm': 0.08702977001667023, 'learning_rate': 1.9684929109049537e-05, 'epoch': 1.58}
{'loss': 0.048, 'grad_norm': 0.0

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.04634253308176994, 'eval_accuracy': 0.05363010726021452, 'eval_f1': 0.1068166661834208, 'eval_precision_1': 0.8256387270282385, 'eval_recall': 0.05710211420422841, 'eval_runtime': 40.0776, 'eval_samples_per_second': 402.444, 'eval_steps_per_second': 25.176, 'epoch': 2.0}
{'loss': 0.0468, 'grad_norm': 0.15287017822265625, 'learning_rate': 1.9597409417118852e-05, 'epoch': 2.01}
{'loss': 0.0467, 'grad_norm': 0.0966062918305397, 'learning_rate': 1.9579905478732717e-05, 'epoch': 2.1}
{'loss': 0.0461, 'grad_norm': 0.09703671187162399, 'learning_rate': 1.956240154034658e-05, 'epoch': 2.19}
{'loss': 0.0464, 'grad_norm': 0.06067479029297829, 'learning_rate': 1.9544897601960442e-05, 'epoch': 2.28}
{'loss': 0.046, 'grad_norm': 0.1493440568447113, 'learning_rate': 1.9527393663574304e-05, 'epoch': 2.36}
{'loss': 0.0458, 'grad_norm': 0.07012952119112015, 'learning_rate': 1.950988972518817e-05, 'epoch': 2.45}
{'loss': 0.046, 'grad_norm': 0.06337795406579971, 'learning_rate': 1.9492385

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.04455872252583504, 'eval_accuracy': 0.051398102796205596, 'eval_f1': 0.10220088972137673, 'eval_precision_1': 0.9141361256544502, 'eval_recall': 0.054126108252216504, 'eval_runtime': 40.0529, 'eval_samples_per_second': 402.692, 'eval_steps_per_second': 25.192, 'epoch': 3.0}
{'loss': 0.0449, 'grad_norm': 0.08660844713449478, 'learning_rate': 1.938736215648521e-05, 'epoch': 3.06}
{'loss': 0.045, 'grad_norm': 0.10336534678936005, 'learning_rate': 1.9369858218099072e-05, 'epoch': 3.15}
{'loss': 0.0441, 'grad_norm': 0.09430891275405884, 'learning_rate': 1.9352354279712938e-05, 'epoch': 3.24}
{'loss': 0.0444, 'grad_norm': 0.07690390944480896, 'learning_rate': 1.93348503413268e-05, 'epoch': 3.33}
{'loss': 0.0444, 'grad_norm': 0.2542254626750946, 'learning_rate': 1.9317346402940666e-05, 'epoch': 3.41}
{'loss': 0.0443, 'grad_norm': 0.13074104487895966, 'learning_rate': 1.9299842464554525e-05, 'epoch': 3.5}
{'loss': 0.044, 'grad_norm': 0.1089024543762207, 'learning_rate': 1.92823

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.04313644766807556, 'eval_accuracy': 0.07898815797631595, 'eval_f1': 0.15872484158222372, 'eval_precision_1': 0.7974754558204769, 'eval_recall': 0.08813317626635253, 'eval_runtime': 40.442, 'eval_samples_per_second': 398.818, 'eval_steps_per_second': 24.949, 'epoch': 4.0}
{'loss': 0.0432, 'grad_norm': 0.11714501678943634, 'learning_rate': 1.9194818834237705e-05, 'epoch': 4.03}
{'loss': 0.0426, 'grad_norm': 0.32087159156799316, 'learning_rate': 1.9177314895851568e-05, 'epoch': 4.11}
{'loss': 0.0425, 'grad_norm': 0.11864697188138962, 'learning_rate': 1.9159810957465433e-05, 'epoch': 4.2}
{'loss': 0.0427, 'grad_norm': 0.08705300837755203, 'learning_rate': 1.9142307019079292e-05, 'epoch': 4.29}
{'loss': 0.0427, 'grad_norm': 0.1479588747024536, 'learning_rate': 1.9124803080693158e-05, 'epoch': 4.38}
{'loss': 0.0423, 'grad_norm': 0.1372266262769699, 'learning_rate': 1.910729914230702e-05, 'epoch': 4.46}
{'loss': 0.0421, 'grad_norm': 0.18194247782230377, 'learning_rate': 1.9089

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.04065314680337906, 'eval_accuracy': 0.09169818339636679, 'eval_f1': 0.19557002179130004, 'eval_precision_1': 0.8873591989987485, 'eval_recall': 0.10989521979043958, 'eval_runtime': 40.2367, 'eval_samples_per_second': 400.853, 'eval_steps_per_second': 25.077, 'epoch': 5.0}
{'loss': 0.0409, 'grad_norm': 0.16224032640457153, 'learning_rate': 1.8984771573604063e-05, 'epoch': 5.08}
{'loss': 0.0401, 'grad_norm': 0.1613471955060959, 'learning_rate': 1.8967267635217926e-05, 'epoch': 5.16}
{'loss': 0.0403, 'grad_norm': 0.19605810940265656, 'learning_rate': 1.8949763696831788e-05, 'epoch': 5.25}
{'loss': 0.0404, 'grad_norm': 0.1435757577419281, 'learning_rate': 1.8932259758445654e-05, 'epoch': 5.34}
{'loss': 0.0398, 'grad_norm': 0.18259510397911072, 'learning_rate': 1.8914755820059513e-05, 'epoch': 5.43}
{'loss': 0.0397, 'grad_norm': 0.16827939450740814, 'learning_rate': 1.8897251881673378e-05, 'epoch': 5.51}
{'loss': 0.0402, 'grad_norm': 0.2680555284023285, 'learning_rate': 1.88

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.03857192024588585, 'eval_accuracy': 0.12158224316448633, 'eval_f1': 0.2523369449755683, 'eval_precision_1': 0.8801407928862541, 'eval_recall': 0.14728129456258912, 'eval_runtime': 40.0084, 'eval_samples_per_second': 403.14, 'eval_steps_per_second': 25.22, 'epoch': 6.0}
{'loss': 0.0386, 'grad_norm': 0.3159990906715393, 'learning_rate': 1.8792228251356555e-05, 'epoch': 6.04}
{'loss': 0.0388, 'grad_norm': 0.3067430555820465, 'learning_rate': 1.877472431297042e-05, 'epoch': 6.13}
{'loss': 0.0379, 'grad_norm': 0.26573503017425537, 'learning_rate': 1.8757220374584283e-05, 'epoch': 6.21}
{'loss': 0.038, 'grad_norm': 0.31002724170684814, 'learning_rate': 1.8739716436198146e-05, 'epoch': 6.3}
{'loss': 0.038, 'grad_norm': 0.31609198451042175, 'learning_rate': 1.8722212497812008e-05, 'epoch': 6.39}
{'loss': 0.0376, 'grad_norm': 0.2510935068130493, 'learning_rate': 1.8704708559425874e-05, 'epoch': 6.48}
{'loss': 0.0374, 'grad_norm': 0.3474331200122833, 'learning_rate': 1.8687204621

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.036481089890003204, 'eval_accuracy': 0.13522227044454088, 'eval_f1': 0.3086029186651699, 'eval_precision_1': 0.8717209570481407, 'eval_recall': 0.18748837497674994, 'eval_runtime': 39.943, 'eval_samples_per_second': 403.8, 'eval_steps_per_second': 25.261, 'epoch': 7.0}
{'loss': 0.0367, 'grad_norm': 0.32110902667045593, 'learning_rate': 1.859968492910905e-05, 'epoch': 7.0}
{'loss': 0.0362, 'grad_norm': 0.28899163007736206, 'learning_rate': 1.8582180990722913e-05, 'epoch': 7.09}
{'loss': 0.0355, 'grad_norm': 0.4137756824493408, 'learning_rate': 1.8564677052336776e-05, 'epoch': 7.18}
{'loss': 0.0358, 'grad_norm': 0.24472381174564362, 'learning_rate': 1.854717311395064e-05, 'epoch': 7.26}
{'loss': 0.0357, 'grad_norm': 0.2519192695617676, 'learning_rate': 1.8529669175564504e-05, 'epoch': 7.35}
{'loss': 0.0353, 'grad_norm': 0.3090006411075592, 'learning_rate': 1.8512165237178366e-05, 'epoch': 7.44}
{'loss': 0.0353, 'grad_norm': 0.33585426211357117, 'learning_rate': 1.84946612

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.03444034233689308, 'eval_accuracy': 0.1638043276086552, 'eval_f1': 0.37490067660301946, 'eval_precision_1': 0.839534131349078, 'eval_recall': 0.24133548267096533, 'eval_runtime': 39.9309, 'eval_samples_per_second': 403.923, 'eval_steps_per_second': 25.269, 'epoch': 8.0}
{'loss': 0.0338, 'grad_norm': 0.28514766693115234, 'learning_rate': 1.838963766847541e-05, 'epoch': 8.05}
{'loss': 0.0333, 'grad_norm': 0.3477918803691864, 'learning_rate': 1.837213373008927e-05, 'epoch': 8.14}
{'loss': 0.0335, 'grad_norm': 0.1898246556520462, 'learning_rate': 1.8354629791703137e-05, 'epoch': 8.23}
{'loss': 0.0332, 'grad_norm': 0.5058656930923462, 'learning_rate': 1.8337125853316996e-05, 'epoch': 8.31}
{'loss': 0.0331, 'grad_norm': 0.2787958085536957, 'learning_rate': 1.831962191493086e-05, 'epoch': 8.4}
{'loss': 0.0327, 'grad_norm': 0.20034998655319214, 'learning_rate': 1.8302117976544724e-05, 'epoch': 8.49}
{'loss': 0.0328, 'grad_norm': 0.25510236620903015, 'learning_rate': 1.828461403

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.03236151859164238, 'eval_accuracy': 0.20497240994481988, 'eval_f1': 0.4326688458174117, 'eval_precision_1': 0.8403075822603719, 'eval_recall': 0.29133858267716534, 'eval_runtime': 39.8707, 'eval_samples_per_second': 404.533, 'eval_steps_per_second': 25.307, 'epoch': 9.0}
{'loss': 0.0317, 'grad_norm': 0.2591475248336792, 'learning_rate': 1.8197094346227905e-05, 'epoch': 9.01}
{'loss': 0.0311, 'grad_norm': 0.22669780254364014, 'learning_rate': 1.8179590407841764e-05, 'epoch': 9.1}
{'loss': 0.0308, 'grad_norm': 0.2788051664829254, 'learning_rate': 1.816208646945563e-05, 'epoch': 9.19}
{'loss': 0.031, 'grad_norm': 0.609571099281311, 'learning_rate': 1.814458253106949e-05, 'epoch': 9.28}
{'loss': 0.0307, 'grad_norm': 0.6225036978721619, 'learning_rate': 1.8127078592683357e-05, 'epoch': 9.36}
{'loss': 0.0311, 'grad_norm': 0.3888065814971924, 'learning_rate': 1.8109574654297216e-05, 'epoch': 9.45}
{'loss': 0.0308, 'grad_norm': 0.35021093487739563, 'learning_rate': 1.8092070715

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.03060835972428322, 'eval_accuracy': 0.23330646661293322, 'eval_f1': 0.47096339546373234, 'eval_precision_1': 0.8544654498044328, 'eval_recall': 0.3250666501333003, 'eval_runtime': 40.0394, 'eval_samples_per_second': 402.829, 'eval_steps_per_second': 25.2, 'epoch': 10.0}
{'loss': 0.0288, 'grad_norm': 0.8402302861213684, 'learning_rate': 1.798704708559426e-05, 'epoch': 10.06}
{'loss': 0.0286, 'grad_norm': 0.4662802517414093, 'learning_rate': 1.7969543147208125e-05, 'epoch': 10.15}
{'loss': 0.0285, 'grad_norm': 0.3380548059940338, 'learning_rate': 1.7952039208821987e-05, 'epoch': 10.24}
{'loss': 0.0288, 'grad_norm': 0.46222132444381714, 'learning_rate': 1.793453527043585e-05, 'epoch': 10.33}
{'loss': 0.029, 'grad_norm': 0.5364705920219421, 'learning_rate': 1.7917031332049712e-05, 'epoch': 10.41}
{'loss': 0.0287, 'grad_norm': 0.45529064536094666, 'learning_rate': 1.7899527393663578e-05, 'epoch': 10.5}
{'loss': 0.0283, 'grad_norm': 0.3129678964614868, 'learning_rate': 1.7882

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.02873058430850506, 'eval_accuracy': 0.28203856407712813, 'eval_f1': 0.5333389494966514, 'eval_precision_1': 0.831713084603258, 'eval_recall': 0.3925227850455701, 'eval_runtime': 39.9076, 'eval_samples_per_second': 404.158, 'eval_steps_per_second': 25.283, 'epoch': 11.0}
{'loss': 0.0271, 'grad_norm': 0.5309699177742004, 'learning_rate': 1.7794503763346755e-05, 'epoch': 11.03}
{'loss': 0.0265, 'grad_norm': 0.5001007914543152, 'learning_rate': 1.7776999824960617e-05, 'epoch': 11.12}
{'loss': 0.0265, 'grad_norm': 0.3864486515522003, 'learning_rate': 1.775949588657448e-05, 'epoch': 11.2}
{'loss': 0.0268, 'grad_norm': 0.7550168037414551, 'learning_rate': 1.7741991948188345e-05, 'epoch': 11.29}
{'loss': 0.0263, 'grad_norm': 0.3290826380252838, 'learning_rate': 1.7724488009802207e-05, 'epoch': 11.38}
{'loss': 0.0263, 'grad_norm': 0.37709158658981323, 'learning_rate': 1.770698407141607e-05, 'epoch': 11.47}
{'loss': 0.0264, 'grad_norm': 0.42516395449638367, 'learning_rate': 1.768

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.02675861306488514, 'eval_accuracy': 0.29741459482918964, 'eval_f1': 0.5595050854566426, 'eval_precision_1': 0.8647177027289816, 'eval_recall': 0.4135408270816542, 'eval_runtime': 40.0878, 'eval_samples_per_second': 402.342, 'eval_steps_per_second': 25.17, 'epoch': 12.0}
{'loss': 0.0243, 'grad_norm': 0.5632449388504028, 'learning_rate': 1.7584456502713113e-05, 'epoch': 12.08}
{'loss': 0.0244, 'grad_norm': 0.3727603256702423, 'learning_rate': 1.7566952564326975e-05, 'epoch': 12.17}
{'loss': 0.0246, 'grad_norm': 0.54230135679245, 'learning_rate': 1.754944862594084e-05, 'epoch': 12.25}
{'loss': 0.0244, 'grad_norm': 0.5587814450263977, 'learning_rate': 1.75319446875547e-05, 'epoch': 12.34}
{'loss': 0.0242, 'grad_norm': 0.46581748127937317, 'learning_rate': 1.7514440749168565e-05, 'epoch': 12.43}
{'loss': 0.0244, 'grad_norm': 0.29937005043029785, 'learning_rate': 1.7496936810782428e-05, 'epoch': 12.52}
{'loss': 0.0243, 'grad_norm': 0.5393856763839722, 'learning_rate': 1.74794

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.025102252140641212, 'eval_accuracy': 0.3317626635253271, 'eval_f1': 0.59345433873391, 'eval_precision_1': 0.8599082136973405, 'eval_recall': 0.45306590613181225, 'eval_runtime': 39.9698, 'eval_samples_per_second': 403.53, 'eval_steps_per_second': 25.244, 'epoch': 13.0}
{'loss': 0.0231, 'grad_norm': 0.786164402961731, 'learning_rate': 1.739191318046561e-05, 'epoch': 13.04}
{'loss': 0.0227, 'grad_norm': 0.44392699003219604, 'learning_rate': 1.7374409242079467e-05, 'epoch': 13.13}
{'loss': 0.0227, 'grad_norm': 0.3270159065723419, 'learning_rate': 1.7356905303693333e-05, 'epoch': 13.22}
{'loss': 0.0227, 'grad_norm': 0.5640963315963745, 'learning_rate': 1.7339401365307195e-05, 'epoch': 13.3}
{'loss': 0.0226, 'grad_norm': 0.5049123764038086, 'learning_rate': 1.732189742692106e-05, 'epoch': 13.39}
{'loss': 0.0224, 'grad_norm': 0.5599268674850464, 'learning_rate': 1.730439348853492e-05, 'epoch': 13.48}
{'loss': 0.0223, 'grad_norm': 0.6371780037879944, 'learning_rate': 1.7286889

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.023793093860149384, 'eval_accuracy': 0.3547027094054188, 'eval_f1': 0.623077075765696, 'eval_precision_1': 0.8661221786877104, 'eval_recall': 0.4865459730919462, 'eval_runtime': 40.0836, 'eval_samples_per_second': 402.384, 'eval_steps_per_second': 25.172, 'epoch': 14.0}
{'loss': 0.0224, 'grad_norm': 0.6093880534172058, 'learning_rate': 1.71993698582181e-05, 'epoch': 14.0}
{'loss': 0.0206, 'grad_norm': 1.3255499601364136, 'learning_rate': 1.7181865919831963e-05, 'epoch': 14.09}
{'loss': 0.0207, 'grad_norm': 0.39217042922973633, 'learning_rate': 1.716436198144583e-05, 'epoch': 14.18}
{'loss': 0.0207, 'grad_norm': 0.5615333914756775, 'learning_rate': 1.714685804305969e-05, 'epoch': 14.27}
{'loss': 0.021, 'grad_norm': 0.3812873065471649, 'learning_rate': 1.7129354104673553e-05, 'epoch': 14.35}
{'loss': 0.0208, 'grad_norm': 0.4928354322910309, 'learning_rate': 1.7111850166287416e-05, 'epoch': 14.44}
{'loss': 0.0208, 'grad_norm': 0.4507003128528595, 'learning_rate': 1.7094346

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.022656457498669624, 'eval_accuracy': 0.394010788021576, 'eval_f1': 0.6539488598159569, 'eval_precision_1': 0.8598130841121495, 'eval_recall': 0.5276210552421104, 'eval_runtime': 40.0618, 'eval_samples_per_second': 402.603, 'eval_steps_per_second': 25.186, 'epoch': 15.0}
{'loss': 0.0196, 'grad_norm': 0.37327104806900024, 'learning_rate': 1.698932259758446e-05, 'epoch': 15.05}
{'loss': 0.0194, 'grad_norm': 0.2653864920139313, 'learning_rate': 1.697181865919832e-05, 'epoch': 15.14}
{'loss': 0.0192, 'grad_norm': 0.25217050313949585, 'learning_rate': 1.6954314720812183e-05, 'epoch': 15.23}
{'loss': 0.0191, 'grad_norm': 0.23397937417030334, 'learning_rate': 1.693681078242605e-05, 'epoch': 15.32}
{'loss': 0.0189, 'grad_norm': 0.46339306235313416, 'learning_rate': 1.691930684403991e-05, 'epoch': 15.4}
{'loss': 0.0194, 'grad_norm': 0.6321467161178589, 'learning_rate': 1.6901802905653773e-05, 'epoch': 15.49}
{'loss': 0.0193, 'grad_norm': 0.539546549320221, 'learning_rate': 1.6884

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.021526096388697624, 'eval_accuracy': 0.4250108500217, 'eval_f1': 0.675445040972026, 'eval_precision_1': 0.8608056849282182, 'eval_recall': 0.5557691115382231, 'eval_runtime': 40.1541, 'eval_samples_per_second': 401.677, 'eval_steps_per_second': 25.128, 'epoch': 16.0}
{'loss': 0.0186, 'grad_norm': 0.3711234927177429, 'learning_rate': 1.679677927533695e-05, 'epoch': 16.02}
{'loss': 0.0175, 'grad_norm': 0.36450910568237305, 'learning_rate': 1.6779275336950816e-05, 'epoch': 16.1}
{'loss': 0.0176, 'grad_norm': 0.37666773796081543, 'learning_rate': 1.676177139856468e-05, 'epoch': 16.19}
{'loss': 0.0176, 'grad_norm': 0.6738249659538269, 'learning_rate': 1.674426746017854e-05, 'epoch': 16.28}
{'loss': 0.0182, 'grad_norm': 0.34816932678222656, 'learning_rate': 1.6726763521792403e-05, 'epoch': 16.37}
{'loss': 0.0176, 'grad_norm': 0.4609517455101013, 'learning_rate': 1.670925958340627e-05, 'epoch': 16.45}
{'loss': 0.018, 'grad_norm': 0.6505247354507446, 'learning_rate': 1.66917556

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.0206122025847435, 'eval_accuracy': 0.4543989087978176, 'eval_f1': 0.696998245127921, 'eval_precision_1': 0.8623668693148056, 'eval_recall': 0.5848471696943394, 'eval_runtime': 40.0157, 'eval_samples_per_second': 403.067, 'eval_steps_per_second': 25.215, 'epoch': 17.0}
{'loss': 0.0166, 'grad_norm': 0.7527020573616028, 'learning_rate': 1.6586732014703312e-05, 'epoch': 17.07}
{'loss': 0.0166, 'grad_norm': 0.3269568085670471, 'learning_rate': 1.656922807631717e-05, 'epoch': 17.15}
{'loss': 0.0161, 'grad_norm': 0.38415077328681946, 'learning_rate': 1.6551724137931037e-05, 'epoch': 17.24}
{'loss': 0.0166, 'grad_norm': 0.2558392584323883, 'learning_rate': 1.65342201995449e-05, 'epoch': 17.33}
{'loss': 0.0165, 'grad_norm': 0.43544864654541016, 'learning_rate': 1.651671626115876e-05, 'epoch': 17.42}
{'loss': 0.0163, 'grad_norm': 0.4646012783050537, 'learning_rate': 1.6499212322772624e-05, 'epoch': 17.5}
{'loss': 0.0166, 'grad_norm': 0.44255316257476807, 'learning_rate': 1.648170

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.019723104313015938, 'eval_accuracy': 0.47696695393390787, 'eval_f1': 0.714306537424386, 'eval_precision_1': 0.8663484486873508, 'eval_recall': 0.6076632153264306, 'eval_runtime': 39.9136, 'eval_samples_per_second': 404.098, 'eval_steps_per_second': 25.28, 'epoch': 18.0}
{'loss': 0.0156, 'grad_norm': 1.1020652055740356, 'learning_rate': 1.6394188692455804e-05, 'epoch': 18.03}
{'loss': 0.0151, 'grad_norm': 0.32700321078300476, 'learning_rate': 1.6376684754069667e-05, 'epoch': 18.12}
{'loss': 0.0152, 'grad_norm': 0.4732944667339325, 'learning_rate': 1.6359180815683532e-05, 'epoch': 18.2}
{'loss': 0.0156, 'grad_norm': 0.6586824655532837, 'learning_rate': 1.634167687729739e-05, 'epoch': 18.29}
{'loss': 0.015, 'grad_norm': 0.4974569082260132, 'learning_rate': 1.6324172938911257e-05, 'epoch': 18.38}
{'loss': 0.0152, 'grad_norm': 0.5177997350692749, 'learning_rate': 1.630666900052512e-05, 'epoch': 18.47}
{'loss': 0.0151, 'grad_norm': 0.8022500872612, 'learning_rate': 1.62891650

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.018915487453341484, 'eval_accuracy': 0.495504991009982, 'eval_f1': 0.728150095430156, 'eval_precision_1': 0.8685567010309279, 'eval_recall': 0.6268212536425073, 'eval_runtime': 39.9915, 'eval_samples_per_second': 403.311, 'eval_steps_per_second': 25.23, 'epoch': 19.0}
{'loss': 0.0136, 'grad_norm': 0.5671690702438354, 'learning_rate': 1.6184141431822162e-05, 'epoch': 19.08}
{'loss': 0.0138, 'grad_norm': 0.46082350611686707, 'learning_rate': 1.6166637493436024e-05, 'epoch': 19.17}
{'loss': 0.0137, 'grad_norm': 0.48139142990112305, 'learning_rate': 1.6149133555049887e-05, 'epoch': 19.25}
{'loss': 0.0142, 'grad_norm': 0.4510304033756256, 'learning_rate': 1.6131629616663753e-05, 'epoch': 19.34}
{'loss': 0.0142, 'grad_norm': 0.5211141705513, 'learning_rate': 1.6114125678277615e-05, 'epoch': 19.43}
{'loss': 0.0142, 'grad_norm': 0.336719810962677, 'learning_rate': 1.6096621739891477e-05, 'epoch': 19.52}
{'loss': 0.0138, 'grad_norm': 0.651703953742981, 'learning_rate': 1.6079117

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.017832765355706215, 'eval_accuracy': 0.5178250356500713, 'eval_f1': 0.7434910977901047, 'eval_precision_1': 0.8792906420620477, 'eval_recall': 0.6440262880525761, 'eval_runtime': 39.9894, 'eval_samples_per_second': 403.332, 'eval_steps_per_second': 25.232, 'epoch': 20.0}
{'loss': 0.0134, 'grad_norm': 0.43431761860847473, 'learning_rate': 1.5991598109574654e-05, 'epoch': 20.04}
{'loss': 0.013, 'grad_norm': 0.4084432125091553, 'learning_rate': 1.597409417118852e-05, 'epoch': 20.13}
{'loss': 0.013, 'grad_norm': 0.37673354148864746, 'learning_rate': 1.5956590232802382e-05, 'epoch': 20.22}
{'loss': 0.0132, 'grad_norm': 0.6322736740112305, 'learning_rate': 1.5939086294416245e-05, 'epoch': 20.3}
{'loss': 0.0128, 'grad_norm': 0.3550415337085724, 'learning_rate': 1.5921582356030107e-05, 'epoch': 20.39}
{'loss': 0.0131, 'grad_norm': 0.46923038363456726, 'learning_rate': 1.5904078417643973e-05, 'epoch': 20.48}
{'loss': 0.013, 'grad_norm': 0.4004756510257721, 'learning_rate': 1.588

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.017032738775014877, 'eval_accuracy': 0.5421290842581685, 'eval_f1': 0.7567212649287501, 'eval_precision_1': 0.884945834889802, 'eval_recall': 0.6609523219046438, 'eval_runtime': 39.9663, 'eval_samples_per_second': 403.565, 'eval_steps_per_second': 25.246, 'epoch': 21.0}
{'loss': 0.0128, 'grad_norm': 0.44057223200798035, 'learning_rate': 1.579905478732715e-05, 'epoch': 21.0}
{'loss': 0.0119, 'grad_norm': 0.30650773644447327, 'learning_rate': 1.5781550848941016e-05, 'epoch': 21.09}
{'loss': 0.0117, 'grad_norm': 0.23099571466445923, 'learning_rate': 1.5764046910554875e-05, 'epoch': 21.18}
{'loss': 0.0121, 'grad_norm': 0.34959885478019714, 'learning_rate': 1.574654297216874e-05, 'epoch': 21.27}
{'loss': 0.0125, 'grad_norm': 0.8765839338302612, 'learning_rate': 1.5729039033782603e-05, 'epoch': 21.35}
{'loss': 0.0117, 'grad_norm': 0.31480899453163147, 'learning_rate': 1.5711535095396465e-05, 'epoch': 21.44}
{'loss': 0.0121, 'grad_norm': 0.3053290843963623, 'learning_rate': 1.

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.01681213639676571, 'eval_accuracy': 0.5693471386942774, 'eval_f1': 0.7704789595556871, 'eval_precision_1': 0.8787878787878788, 'eval_recall': 0.6859383718767438, 'eval_runtime': 40.0066, 'eval_samples_per_second': 403.158, 'eval_steps_per_second': 25.221, 'epoch': 22.0}
{'loss': 0.0113, 'grad_norm': 0.3512958884239197, 'learning_rate': 1.5589007526693508e-05, 'epoch': 22.05}
{'loss': 0.0109, 'grad_norm': 0.6519533395767212, 'learning_rate': 1.557150358830737e-05, 'epoch': 22.14}
{'loss': 0.0112, 'grad_norm': 0.2994210422039032, 'learning_rate': 1.5553999649921233e-05, 'epoch': 22.23}
{'loss': 0.0114, 'grad_norm': 0.5141429901123047, 'learning_rate': 1.5536495711535095e-05, 'epoch': 22.32}
{'loss': 0.0108, 'grad_norm': 0.4228489100933075, 'learning_rate': 1.551899177314896e-05, 'epoch': 22.41}
{'loss': 0.0112, 'grad_norm': 0.8493528962135315, 'learning_rate': 1.5501487834762823e-05, 'epoch': 22.49}
{'loss': 0.0112, 'grad_norm': 0.32973700761795044, 'learning_rate': 1.548

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.016110597178339958, 'eval_accuracy': 0.5895591791183582, 'eval_f1': 0.7791443942775096, 'eval_precision_1': 0.8814435007240988, 'eval_recall': 0.6981213962427925, 'eval_runtime': 40.1954, 'eval_samples_per_second': 401.265, 'eval_steps_per_second': 25.102, 'epoch': 23.0}
{'loss': 0.0113, 'grad_norm': 0.3960302472114563, 'learning_rate': 1.5396464204446004e-05, 'epoch': 23.02}
{'loss': 0.0102, 'grad_norm': 0.45575353503227234, 'learning_rate': 1.5378960266059866e-05, 'epoch': 23.11}
{'loss': 0.0101, 'grad_norm': 0.666905403137207, 'learning_rate': 1.5361456327673728e-05, 'epoch': 23.19}
{'loss': 0.0105, 'grad_norm': 0.4618634879589081, 'learning_rate': 1.534395238928759e-05, 'epoch': 23.28}
{'loss': 0.0102, 'grad_norm': 0.2218555212020874, 'learning_rate': 1.5326448450901453e-05, 'epoch': 23.37}
{'loss': 0.0105, 'grad_norm': 0.23376810550689697, 'learning_rate': 1.530894451251532e-05, 'epoch': 23.46}
{'loss': 0.0109, 'grad_norm': 0.19805067777633667, 'learning_rate': 1.5

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.015726709738373756, 'eval_accuracy': 0.6043772087544175, 'eval_f1': 0.7891042034980519, 'eval_precision_1': 0.884013383071184, 'eval_recall': 0.7125984251968503, 'eval_runtime': 40.194, 'eval_samples_per_second': 401.278, 'eval_steps_per_second': 25.103, 'epoch': 24.0}
{'loss': 0.0098, 'grad_norm': 0.18009205162525177, 'learning_rate': 1.5186416943812358e-05, 'epoch': 24.07}
{'loss': 0.0094, 'grad_norm': 0.4124247133731842, 'learning_rate': 1.5168913005426222e-05, 'epoch': 24.16}
{'loss': 0.0096, 'grad_norm': 0.3072352111339569, 'learning_rate': 1.5151409067040086e-05, 'epoch': 24.24}
{'loss': 0.0097, 'grad_norm': 0.24232611060142517, 'learning_rate': 1.5133905128653947e-05, 'epoch': 24.33}
{'loss': 0.0095, 'grad_norm': 0.3388764560222626, 'learning_rate': 1.511640119026781e-05, 'epoch': 24.42}
{'loss': 0.0097, 'grad_norm': 0.2183600515127182, 'learning_rate': 1.5098897251881675e-05, 'epoch': 24.51}
{'loss': 0.0093, 'grad_norm': 0.35863375663757324, 'learning_rate': 1.5

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.015111688524484634, 'eval_accuracy': 0.6219852439704879, 'eval_f1': 0.8000818777292577, 'eval_precision_1': 0.8894788743078207, 'eval_recall': 0.7270134540269081, 'eval_runtime': 40.0854, 'eval_samples_per_second': 402.366, 'eval_steps_per_second': 25.171, 'epoch': 25.0}
{'loss': 0.0095, 'grad_norm': 0.2738175690174103, 'learning_rate': 1.4993873621564854e-05, 'epoch': 25.03}
{'loss': 0.0085, 'grad_norm': 0.4034229815006256, 'learning_rate': 1.4976369683178718e-05, 'epoch': 25.12}
{'loss': 0.0088, 'grad_norm': 0.5125371217727661, 'learning_rate': 1.4958865744792578e-05, 'epoch': 25.21}
{'loss': 0.0092, 'grad_norm': 0.34635260701179504, 'learning_rate': 1.4941361806406442e-05, 'epoch': 25.29}
{'loss': 0.0089, 'grad_norm': 0.40314793586730957, 'learning_rate': 1.4923857868020306e-05, 'epoch': 25.38}
{'loss': 0.0088, 'grad_norm': 0.5243902206420898, 'learning_rate': 1.490635392963417e-05, 'epoch': 25.47}
{'loss': 0.009, 'grad_norm': 0.4887036085128784, 'learning_rate': 1.4

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.015182815492153168, 'eval_accuracy': 0.6407712815425631, 'eval_f1': 0.805718722571062, 'eval_precision_1': 0.8844215519158082, 'eval_recall': 0.7398784797569595, 'eval_runtime': 40.0778, 'eval_samples_per_second': 402.442, 'eval_steps_per_second': 25.176, 'epoch': 26.0}
{'loss': 0.0083, 'grad_norm': 0.3147730827331543, 'learning_rate': 1.478382636093121e-05, 'epoch': 26.08}
{'loss': 0.0082, 'grad_norm': 0.555785596370697, 'learning_rate': 1.4766322422545074e-05, 'epoch': 26.17}
{'loss': 0.0082, 'grad_norm': 0.350954532623291, 'learning_rate': 1.4748818484158938e-05, 'epoch': 26.26}
{'loss': 0.0082, 'grad_norm': 0.4208489656448364, 'learning_rate': 1.4731314545772799e-05, 'epoch': 26.34}
{'loss': 0.0083, 'grad_norm': 0.23982276022434235, 'learning_rate': 1.4713810607386663e-05, 'epoch': 26.43}
{'loss': 0.0084, 'grad_norm': 0.7102488279342651, 'learning_rate': 1.4696306669000527e-05, 'epoch': 26.52}
{'loss': 0.0085, 'grad_norm': 0.18051111698150635, 'learning_rate': 1.467

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014692414551973343, 'eval_accuracy': 0.6548453096906194, 'eval_f1': 0.8138075667630693, 'eval_precision_1': 0.8954185120398973, 'eval_recall': 0.7458304916609834, 'eval_runtime': 40.1356, 'eval_samples_per_second': 401.863, 'eval_steps_per_second': 25.14, 'epoch': 27.0}
{'loss': 0.0082, 'grad_norm': 1.2069952487945557, 'learning_rate': 1.4591283038683706e-05, 'epoch': 27.04}
{'loss': 0.0074, 'grad_norm': 0.2888842523097992, 'learning_rate': 1.4573779100297568e-05, 'epoch': 27.13}
{'loss': 0.0076, 'grad_norm': 0.4537661075592041, 'learning_rate': 1.455627516191143e-05, 'epoch': 27.22}
{'loss': 0.008, 'grad_norm': 0.17015302181243896, 'learning_rate': 1.4538771223525294e-05, 'epoch': 27.31}
{'loss': 0.0076, 'grad_norm': 0.4554646611213684, 'learning_rate': 1.4521267285139158e-05, 'epoch': 27.39}
{'loss': 0.008, 'grad_norm': 0.19741781055927277, 'learning_rate': 1.4503763346753022e-05, 'epoch': 27.48}
{'loss': 0.0078, 'grad_norm': 0.24183699488639832, 'learning_rate': 1.44

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.01438866276293993, 'eval_accuracy': 0.6752433504867009, 'eval_f1': 0.818687393212503, 'eval_precision_1': 0.8905612244897959, 'eval_recall': 0.7575485150970301, 'eval_runtime': 40.0595, 'eval_samples_per_second': 402.626, 'eval_steps_per_second': 25.188, 'epoch': 28.0}
{'loss': 0.0081, 'grad_norm': 0.44629374146461487, 'learning_rate': 1.4398739716436198e-05, 'epoch': 28.01}
{'loss': 0.0071, 'grad_norm': 0.6275150775909424, 'learning_rate': 1.4381235778050062e-05, 'epoch': 28.09}
{'loss': 0.0073, 'grad_norm': 0.45620298385620117, 'learning_rate': 1.4363731839663926e-05, 'epoch': 28.18}
{'loss': 0.0073, 'grad_norm': 0.14996328949928284, 'learning_rate': 1.434622790127779e-05, 'epoch': 28.27}
{'loss': 0.0075, 'grad_norm': 0.4120556712150574, 'learning_rate': 1.432872396289165e-05, 'epoch': 28.36}
{'loss': 0.0075, 'grad_norm': 0.4313376247882843, 'learning_rate': 1.4311220024505514e-05, 'epoch': 28.44}
{'loss': 0.0075, 'grad_norm': 0.6018165946006775, 'learning_rate': 1.42

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014028962701559067, 'eval_accuracy': 0.6878913757827516, 'eval_f1': 0.8266604490040637, 'eval_precision_1': 0.8931836176491759, 'eval_recall': 0.7693595387190775, 'eval_runtime': 40.1672, 'eval_samples_per_second': 401.547, 'eval_steps_per_second': 25.12, 'epoch': 29.0}
{'loss': 0.007, 'grad_norm': 0.4187922477722168, 'learning_rate': 1.4188692455802557e-05, 'epoch': 29.06}
{'loss': 0.0065, 'grad_norm': 0.3964548110961914, 'learning_rate': 1.4171188517416418e-05, 'epoch': 29.14}
{'loss': 0.0069, 'grad_norm': 0.06022810935974121, 'learning_rate': 1.4153684579030282e-05, 'epoch': 29.23}
{'loss': 0.0069, 'grad_norm': 0.6650534272193909, 'learning_rate': 1.4136180640644146e-05, 'epoch': 29.32}
{'loss': 0.0071, 'grad_norm': 0.5677489638328552, 'learning_rate': 1.411867670225801e-05, 'epoch': 29.41}
{'loss': 0.0067, 'grad_norm': 0.5725829005241394, 'learning_rate': 1.4101172763871872e-05, 'epoch': 29.49}
{'loss': 0.0068, 'grad_norm': 0.5570876002311707, 'learning_rate': 1.408

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014293638989329338, 'eval_accuracy': 0.6984313968627938, 'eval_f1': 0.8289100704517198, 'eval_precision_1': 0.8907970214130474, 'eval_recall': 0.7750635501271003, 'eval_runtime': 40.0463, 'eval_samples_per_second': 402.759, 'eval_steps_per_second': 25.196, 'epoch': 30.0}
{'loss': 0.0067, 'grad_norm': 0.2761797606945038, 'learning_rate': 1.399614913355505e-05, 'epoch': 30.02}
{'loss': 0.0063, 'grad_norm': 0.5428466796875, 'learning_rate': 1.3978645195168914e-05, 'epoch': 30.11}
{'loss': 0.0063, 'grad_norm': 0.15372247993946075, 'learning_rate': 1.3961141256782778e-05, 'epoch': 30.19}
{'loss': 0.0065, 'grad_norm': 0.312845915555954, 'learning_rate': 1.3943637318396642e-05, 'epoch': 30.28}
{'loss': 0.0066, 'grad_norm': 0.36218908429145813, 'learning_rate': 1.3926133380010502e-05, 'epoch': 30.37}
{'loss': 0.0064, 'grad_norm': 0.15754231810569763, 'learning_rate': 1.3908629441624366e-05, 'epoch': 30.46}
{'loss': 0.0066, 'grad_norm': 0.2439102679491043, 'learning_rate': 1.389

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.013930306769907475, 'eval_accuracy': 0.705251410502821, 'eval_f1': 0.8324987564251368, 'eval_precision_1': 0.8949094538713818, 'eval_recall': 0.7782255564511129, 'eval_runtime': 40.2339, 'eval_samples_per_second': 400.881, 'eval_steps_per_second': 25.078, 'epoch': 31.0}
{'loss': 0.006, 'grad_norm': 0.36950168013572693, 'learning_rate': 1.378610187292141e-05, 'epoch': 31.07}
{'loss': 0.006, 'grad_norm': 0.1309751719236374, 'learning_rate': 1.376859793453527e-05, 'epoch': 31.16}
{'loss': 0.0062, 'grad_norm': 0.36561939120292664, 'learning_rate': 1.3751093996149134e-05, 'epoch': 31.24}
{'loss': 0.0062, 'grad_norm': 0.3577694296836853, 'learning_rate': 1.3733590057762998e-05, 'epoch': 31.33}
{'loss': 0.006, 'grad_norm': 0.9678152799606323, 'learning_rate': 1.3716086119376862e-05, 'epoch': 31.42}
{'loss': 0.0065, 'grad_norm': 0.2390802949666977, 'learning_rate': 1.3698582180990724e-05, 'epoch': 31.51}
{'loss': 0.0064, 'grad_norm': 0.251815527677536, 'learning_rate': 1.368107

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014089885167777538, 'eval_accuracy': 0.7108934217868436, 'eval_f1': 0.8341748472859106, 'eval_precision_1': 0.8950584390209244, 'eval_recall': 0.7810465620931242, 'eval_runtime': 40.2619, 'eval_samples_per_second': 400.602, 'eval_steps_per_second': 25.061, 'epoch': 32.0}
{'loss': 0.006, 'grad_norm': 0.24881237745285034, 'learning_rate': 1.3593558550673901e-05, 'epoch': 32.03}
{'loss': 0.0055, 'grad_norm': 0.20530402660369873, 'learning_rate': 1.3576054612287765e-05, 'epoch': 32.12}
{'loss': 0.0058, 'grad_norm': 0.2226240634918213, 'learning_rate': 1.355855067390163e-05, 'epoch': 32.21}
{'loss': 0.0056, 'grad_norm': 0.6278448104858398, 'learning_rate': 1.3541046735515494e-05, 'epoch': 32.29}
{'loss': 0.006, 'grad_norm': 0.6227023005485535, 'learning_rate': 1.3523542797129354e-05, 'epoch': 32.38}
{'loss': 0.0062, 'grad_norm': 0.19740653038024902, 'learning_rate': 1.3506038858743218e-05, 'epoch': 32.47}
{'loss': 0.0059, 'grad_norm': 0.15342991054058075, 'learning_rate': 1.

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.01391899399459362, 'eval_accuracy': 0.7179614359228719, 'eval_f1': 0.8376113581144736, 'eval_precision_1': 0.8971145335457603, 'eval_recall': 0.7855105710211421, 'eval_runtime': 40.1622, 'eval_samples_per_second': 401.596, 'eval_steps_per_second': 25.123, 'epoch': 33.0}
{'loss': 0.0054, 'grad_norm': 0.47292327880859375, 'learning_rate': 1.3383511290040261e-05, 'epoch': 33.08}
{'loss': 0.0053, 'grad_norm': 0.25051283836364746, 'learning_rate': 1.3366007351654122e-05, 'epoch': 33.17}
{'loss': 0.0057, 'grad_norm': 1.0154417753219604, 'learning_rate': 1.3348503413267986e-05, 'epoch': 33.26}
{'loss': 0.0055, 'grad_norm': 0.2178240269422531, 'learning_rate': 1.333099947488185e-05, 'epoch': 33.35}
{'loss': 0.0056, 'grad_norm': 0.16878817975521088, 'learning_rate': 1.3313495536495714e-05, 'epoch': 33.43}
{'loss': 0.0058, 'grad_norm': 0.06477387249469757, 'learning_rate': 1.3295991598109576e-05, 'epoch': 33.52}
{'loss': 0.0054, 'grad_norm': 0.27852529287338257, 'learning_rate': 

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.01343468390405178, 'eval_accuracy': 0.7309194618389236, 'eval_f1': 0.8431546155107394, 'eval_precision_1': 0.8948907141862732, 'eval_recall': 0.7970735941471883, 'eval_runtime': 40.1508, 'eval_samples_per_second': 401.711, 'eval_steps_per_second': 25.13, 'epoch': 34.0}
{'loss': 0.0055, 'grad_norm': 0.1564510315656662, 'learning_rate': 1.3190967967792753e-05, 'epoch': 34.05}
{'loss': 0.0052, 'grad_norm': 0.14176882803440094, 'learning_rate': 1.3173464029406617e-05, 'epoch': 34.13}
{'loss': 0.0052, 'grad_norm': 0.36492013931274414, 'learning_rate': 1.3155960091020481e-05, 'epoch': 34.22}
{'loss': 0.0053, 'grad_norm': 0.3674570620059967, 'learning_rate': 1.3138456152634345e-05, 'epoch': 34.31}
{'loss': 0.0054, 'grad_norm': 0.4785017967224121, 'learning_rate': 1.3120952214248206e-05, 'epoch': 34.4}
{'loss': 0.0055, 'grad_norm': 0.30706989765167236, 'learning_rate': 1.310344827586207e-05, 'epoch': 34.48}
{'loss': 0.0053, 'grad_norm': 0.06882581859827042, 'learning_rate': 1.3

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.013782420195639133, 'eval_accuracy': 0.7369334738669477, 'eval_f1': 0.843637912114909, 'eval_precision_1': 0.8943018854821349, 'eval_recall': 0.7984065968131936, 'eval_runtime': 40.0716, 'eval_samples_per_second': 402.505, 'eval_steps_per_second': 25.18, 'epoch': 35.0}
{'loss': 0.0056, 'grad_norm': 0.48335909843444824, 'learning_rate': 1.2998424645545249e-05, 'epoch': 35.01}
{'loss': 0.0051, 'grad_norm': 0.21536946296691895, 'learning_rate': 1.2980920707159113e-05, 'epoch': 35.1}
{'loss': 0.005, 'grad_norm': 0.4983856678009033, 'learning_rate': 1.2963416768772974e-05, 'epoch': 35.18}
{'loss': 0.0049, 'grad_norm': 0.1497238725423813, 'learning_rate': 1.2945912830386838e-05, 'epoch': 35.27}
{'loss': 0.005, 'grad_norm': 0.3281732201576233, 'learning_rate': 1.2928408892000702e-05, 'epoch': 35.36}
{'loss': 0.0053, 'grad_norm': 0.3274773061275482, 'learning_rate': 1.2910904953614566e-05, 'epoch': 35.45}
{'loss': 0.0052, 'grad_norm': 0.12872961163520813, 'learning_rate': 1.289

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.013690296560525894, 'eval_accuracy': 0.7418314836629674, 'eval_f1': 0.8465256500581653, 'eval_precision_1': 0.897758470894874, 'eval_recall': 0.8008246016492033, 'eval_runtime': 40.2256, 'eval_samples_per_second': 400.963, 'eval_steps_per_second': 25.084, 'epoch': 36.0}
{'loss': 0.0047, 'grad_norm': 0.2838677763938904, 'learning_rate': 1.2788377384911605e-05, 'epoch': 36.06}
{'loss': 0.0048, 'grad_norm': 0.0901000052690506, 'learning_rate': 1.277087344652547e-05, 'epoch': 36.15}
{'loss': 0.0046, 'grad_norm': 0.24996282160282135, 'learning_rate': 1.2753369508139333e-05, 'epoch': 36.23}
{'loss': 0.0049, 'grad_norm': 0.7339895963668823, 'learning_rate': 1.2735865569753196e-05, 'epoch': 36.32}
{'loss': 0.0049, 'grad_norm': 0.027808157727122307, 'learning_rate': 1.2718361631367058e-05, 'epoch': 36.41}
{'loss': 0.005, 'grad_norm': 0.11755041033029556, 'learning_rate': 1.2700857692980922e-05, 'epoch': 36.5}
{'loss': 0.0049, 'grad_norm': 0.40108543634414673, 'learning_rate': 1.

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.013815631158649921, 'eval_accuracy': 0.7428234856469713, 'eval_f1': 0.846959409472767, 'eval_precision_1': 0.9016416395393608, 'eval_recall': 0.7985305970611941, 'eval_runtime': 40.1755, 'eval_samples_per_second': 401.464, 'eval_steps_per_second': 25.115, 'epoch': 37.0}
{'loss': 0.005, 'grad_norm': 0.1646035760641098, 'learning_rate': 1.25958340626641e-05, 'epoch': 37.02}
{'loss': 0.0046, 'grad_norm': 0.2571295201778412, 'learning_rate': 1.2578330124277965e-05, 'epoch': 37.11}
{'loss': 0.0047, 'grad_norm': 0.11336588114500046, 'learning_rate': 1.2560826185891825e-05, 'epoch': 37.2}
{'loss': 0.0046, 'grad_norm': 0.3841153681278229, 'learning_rate': 1.254332224750569e-05, 'epoch': 37.28}
{'loss': 0.0047, 'grad_norm': 0.10296372324228287, 'learning_rate': 1.2525818309119553e-05, 'epoch': 37.37}
{'loss': 0.0047, 'grad_norm': 0.2725949287414551, 'learning_rate': 1.2508314370733416e-05, 'epoch': 37.46}
{'loss': 0.0048, 'grad_norm': 0.3650917410850525, 'learning_rate': 1.24908

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.013640027493238449, 'eval_accuracy': 0.7490854981709963, 'eval_f1': 0.850103140041256, 'eval_precision_1': 0.9007424368581738, 'eval_recall': 0.8048546097092194, 'eval_runtime': 40.3062, 'eval_samples_per_second': 400.162, 'eval_steps_per_second': 25.033, 'epoch': 38.0}
{'loss': 0.0046, 'grad_norm': 0.18303321301937103, 'learning_rate': 1.2385786802030457e-05, 'epoch': 38.07}
{'loss': 0.0044, 'grad_norm': 0.1497429758310318, 'learning_rate': 1.2368282863644321e-05, 'epoch': 38.16}
{'loss': 0.0043, 'grad_norm': 0.43655553460121155, 'learning_rate': 1.2350778925258185e-05, 'epoch': 38.25}
{'loss': 0.0044, 'grad_norm': 0.12496143579483032, 'learning_rate': 1.2333274986872047e-05, 'epoch': 38.33}
{'loss': 0.0049, 'grad_norm': 0.32252275943756104, 'learning_rate': 1.231577104848591e-05, 'epoch': 38.42}
{'loss': 0.0046, 'grad_norm': 0.17555227875709534, 'learning_rate': 1.2298267110099774e-05, 'epoch': 38.51}
{'loss': 0.0046, 'grad_norm': 0.3356007933616638, 'learning_rate': 

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.013988392427563667, 'eval_accuracy': 0.7539215078430157, 'eval_f1': 0.8490224534055147, 'eval_precision_1': 0.8955932436616326, 'eval_recall': 0.8070556141112282, 'eval_runtime': 40.0529, 'eval_samples_per_second': 402.692, 'eval_steps_per_second': 25.192, 'epoch': 39.0}
{'loss': 0.0045, 'grad_norm': 0.2628023326396942, 'learning_rate': 1.2193243479782953e-05, 'epoch': 39.03}
{'loss': 0.0042, 'grad_norm': 0.03967685624957085, 'learning_rate': 1.2175739541396817e-05, 'epoch': 39.12}
{'loss': 0.0042, 'grad_norm': 0.6303163170814514, 'learning_rate': 1.2158235603010677e-05, 'epoch': 39.21}
{'loss': 0.0045, 'grad_norm': 0.2170306146144867, 'learning_rate': 1.2140731664624541e-05, 'epoch': 39.3}
{'loss': 0.0045, 'grad_norm': 0.0571117028594017, 'learning_rate': 1.2123227726238405e-05, 'epoch': 39.38}
{'loss': 0.0044, 'grad_norm': 0.29009419679641724, 'learning_rate': 1.2105723787852268e-05, 'epoch': 39.47}
{'loss': 0.0046, 'grad_norm': 0.7803731560707092, 'learning_rate': 1.

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.0140338484197855, 'eval_accuracy': 0.7536115072230144, 'eval_f1': 0.8498646400730617, 'eval_precision_1': 0.8966276668960771, 'eval_recall': 0.8077376154752309, 'eval_runtime': 40.0654, 'eval_samples_per_second': 402.567, 'eval_steps_per_second': 25.184, 'epoch': 40.0}
{'loss': 0.0041, 'grad_norm': 0.06704460829496384, 'learning_rate': 1.1983196219149309e-05, 'epoch': 40.08}
{'loss': 0.0043, 'grad_norm': 0.13657470047473907, 'learning_rate': 1.1965692280763173e-05, 'epoch': 40.17}
{'loss': 0.004, 'grad_norm': 0.20096683502197266, 'learning_rate': 1.1948188342377037e-05, 'epoch': 40.26}
{'loss': 0.0041, 'grad_norm': 0.053506262600421906, 'learning_rate': 1.19306844039909e-05, 'epoch': 40.35}
{'loss': 0.0045, 'grad_norm': 0.16554002463817596, 'learning_rate': 1.1913180465604762e-05, 'epoch': 40.43}
{'loss': 0.0045, 'grad_norm': 0.09361979365348816, 'learning_rate': 1.1895676527218626e-05, 'epoch': 40.52}
{'loss': 0.0043, 'grad_norm': 0.3071986734867096, 'learning_rate': 1

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014088280498981476, 'eval_accuracy': 0.7572075144150289, 'eval_f1': 0.8514301492682769, 'eval_precision_1': 0.8942645603739465, 'eval_recall': 0.81251162502325, 'eval_runtime': 40.0203, 'eval_samples_per_second': 403.02, 'eval_steps_per_second': 25.212, 'epoch': 41.0}
{'loss': 0.0045, 'grad_norm': 0.12962408363819122, 'learning_rate': 1.1790652896901804e-05, 'epoch': 41.05}
{'loss': 0.0041, 'grad_norm': 0.20949162542819977, 'learning_rate': 1.1773148958515668e-05, 'epoch': 41.13}
{'loss': 0.0042, 'grad_norm': 0.22756953537464142, 'learning_rate': 1.1755645020129529e-05, 'epoch': 41.22}
{'loss': 0.0041, 'grad_norm': 0.051337264478206635, 'learning_rate': 1.1738141081743393e-05, 'epoch': 41.31}
{'loss': 0.004, 'grad_norm': 0.3235204517841339, 'learning_rate': 1.1720637143357257e-05, 'epoch': 41.4}
{'loss': 0.0041, 'grad_norm': 0.3698206841945648, 'learning_rate': 1.170313320497112e-05, 'epoch': 41.48}
{'loss': 0.0043, 'grad_norm': 0.11314024776220322, 'learning_rate': 1.1

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.013972001150250435, 'eval_accuracy': 0.758013516027032, 'eval_f1': 0.8511844634885464, 'eval_precision_1': 0.8970467032967033, 'eval_recall': 0.8097836195672391, 'eval_runtime': 39.9479, 'eval_samples_per_second': 403.751, 'eval_steps_per_second': 25.258, 'epoch': 42.0}
{'loss': 0.0043, 'grad_norm': 1.1000299453735352, 'learning_rate': 1.1598109574654297e-05, 'epoch': 42.01}
{'loss': 0.0039, 'grad_norm': 0.22412128746509552, 'learning_rate': 1.158060563626816e-05, 'epoch': 42.1}
{'loss': 0.0038, 'grad_norm': 0.14652962982654572, 'learning_rate': 1.1563101697882025e-05, 'epoch': 42.18}
{'loss': 0.0038, 'grad_norm': 0.06745512038469315, 'learning_rate': 1.1545597759495889e-05, 'epoch': 42.27}
{'loss': 0.004, 'grad_norm': 0.32516732811927795, 'learning_rate': 1.1528093821109751e-05, 'epoch': 42.36}
{'loss': 0.0042, 'grad_norm': 0.10553308576345444, 'learning_rate': 1.1510589882723613e-05, 'epoch': 42.45}
{'loss': 0.0041, 'grad_norm': 0.07629158347845078, 'learning_rate': 1

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014064098708331585, 'eval_accuracy': 0.7640275280550561, 'eval_f1': 0.8536410923276984, 'eval_precision_1': 0.8973412617045998, 'eval_recall': 0.813999627999256, 'eval_runtime': 40.2406, 'eval_samples_per_second': 400.814, 'eval_steps_per_second': 25.074, 'epoch': 43.0}
{'loss': 0.0038, 'grad_norm': 0.14849305152893066, 'learning_rate': 1.1388062314020656e-05, 'epoch': 43.06}
{'loss': 0.0037, 'grad_norm': 0.17045283317565918, 'learning_rate': 1.1370558375634519e-05, 'epoch': 43.15}
{'loss': 0.0037, 'grad_norm': 0.34576526284217834, 'learning_rate': 1.1353054437248381e-05, 'epoch': 43.23}
{'loss': 0.0039, 'grad_norm': 0.11048150807619095, 'learning_rate': 1.1335550498862245e-05, 'epoch': 43.32}
{'loss': 0.004, 'grad_norm': 0.036001745611429214, 'learning_rate': 1.1318046560476107e-05, 'epoch': 43.41}
{'loss': 0.0041, 'grad_norm': 0.5244663953781128, 'learning_rate': 1.1300542622089971e-05, 'epoch': 43.5}
{'loss': 0.0041, 'grad_norm': 0.3488143980503082, 'learning_rate': 

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.013750358484685421, 'eval_accuracy': 0.7657015314030629, 'eval_f1': 0.8566220746671874, 'eval_precision_1': 0.9016719199671098, 'eval_recall': 0.8158596317192635, 'eval_runtime': 40.0915, 'eval_samples_per_second': 402.304, 'eval_steps_per_second': 25.167, 'epoch': 44.0}
{'loss': 0.0039, 'grad_norm': 0.06552322953939438, 'learning_rate': 1.1195518991773149e-05, 'epoch': 44.02}
{'loss': 0.0035, 'grad_norm': 0.14054937660694122, 'learning_rate': 1.1178015053387013e-05, 'epoch': 44.11}
{'loss': 0.0037, 'grad_norm': 0.0683925449848175, 'learning_rate': 1.1160511115000877e-05, 'epoch': 44.2}
{'loss': 0.0038, 'grad_norm': 0.1599508374929428, 'learning_rate': 1.1143007176614739e-05, 'epoch': 44.28}
{'loss': 0.0037, 'grad_norm': 0.12838895618915558, 'learning_rate': 1.1125503238228603e-05, 'epoch': 44.37}
{'loss': 0.0039, 'grad_norm': 0.36960989236831665, 'learning_rate': 1.1107999299842465e-05, 'epoch': 44.46}
{'loss': 0.0039, 'grad_norm': 0.13163810968399048, 'learning_rate':

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.01388411782681942, 'eval_accuracy': 0.767747535495071, 'eval_f1': 0.8575236052199633, 'eval_precision_1': 0.9012126387702818, 'eval_recall': 0.8178746357492716, 'eval_runtime': 40.3008, 'eval_samples_per_second': 400.215, 'eval_steps_per_second': 25.037, 'epoch': 45.0}
{'loss': 0.0036, 'grad_norm': 0.10022368282079697, 'learning_rate': 1.0985471731139508e-05, 'epoch': 45.07}
{'loss': 0.0036, 'grad_norm': 0.15404342114925385, 'learning_rate': 1.096796779275337e-05, 'epoch': 45.16}
{'loss': 0.0037, 'grad_norm': 0.5439798831939697, 'learning_rate': 1.0950463854367233e-05, 'epoch': 45.25}
{'loss': 0.0037, 'grad_norm': 0.414844810962677, 'learning_rate': 1.0932959915981097e-05, 'epoch': 45.34}
{'loss': 0.0039, 'grad_norm': 0.44651126861572266, 'learning_rate': 1.0915455977594959e-05, 'epoch': 45.42}
{'loss': 0.0038, 'grad_norm': 0.28239545226097107, 'learning_rate': 1.0897952039208823e-05, 'epoch': 45.51}
{'loss': 0.0037, 'grad_norm': 0.31652092933654785, 'learning_rate': 1.

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014216024428606033, 'eval_accuracy': 0.7688015376030752, 'eval_f1': 0.8548870839615453, 'eval_precision_1': 0.8960407816482583, 'eval_recall': 0.8173476346952694, 'eval_runtime': 40.1004, 'eval_samples_per_second': 402.215, 'eval_steps_per_second': 25.162, 'epoch': 46.0}
{'loss': 0.0037, 'grad_norm': 0.12671411037445068, 'learning_rate': 1.0792928408892e-05, 'epoch': 46.04}
{'loss': 0.0036, 'grad_norm': 1.0252798795700073, 'learning_rate': 1.0775424470505864e-05, 'epoch': 46.12}
{'loss': 0.0037, 'grad_norm': 0.340760201215744, 'learning_rate': 1.0757920532119728e-05, 'epoch': 46.21}
{'loss': 0.0035, 'grad_norm': 0.0739973708987236, 'learning_rate': 1.074041659373359e-05, 'epoch': 46.3}
{'loss': 0.0038, 'grad_norm': 0.297688364982605, 'learning_rate': 1.0722912655347455e-05, 'epoch': 46.39}
{'loss': 0.0036, 'grad_norm': 0.07341129332780838, 'learning_rate': 1.0705408716961317e-05, 'epoch': 46.47}
{'loss': 0.004, 'grad_norm': 0.772711992263794, 'learning_rate': 1.06879047

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014134732075035572, 'eval_accuracy': 0.767003534007068, 'eval_f1': 0.8557643886352929, 'eval_precision_1': 0.8989454284836695, 'eval_recall': 0.8165416330832662, 'eval_runtime': 40.1237, 'eval_samples_per_second': 401.982, 'eval_steps_per_second': 25.147, 'epoch': 47.0}
{'loss': 0.0035, 'grad_norm': 0.015315007418394089, 'learning_rate': 1.058288114825836e-05, 'epoch': 47.09}
{'loss': 0.0036, 'grad_norm': 0.07064381241798401, 'learning_rate': 1.0565377209872222e-05, 'epoch': 47.17}
{'loss': 0.0034, 'grad_norm': 0.5602666139602661, 'learning_rate': 1.0547873271486085e-05, 'epoch': 47.26}
{'loss': 0.0036, 'grad_norm': 0.20111429691314697, 'learning_rate': 1.0530369333099949e-05, 'epoch': 47.35}
{'loss': 0.0034, 'grad_norm': 0.3807985186576843, 'learning_rate': 1.0512865394713811e-05, 'epoch': 47.44}
{'loss': 0.0037, 'grad_norm': 0.33500948548316956, 'learning_rate': 1.0495361456327675e-05, 'epoch': 47.52}
{'loss': 0.0035, 'grad_norm': 0.05552535876631737, 'learning_rate':

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014233659952878952, 'eval_accuracy': 0.7656395312790626, 'eval_f1': 0.8548736227072246, 'eval_precision_1': 0.899880075381189, 'eval_recall': 0.8141546283092567, 'eval_runtime': 40.0329, 'eval_samples_per_second': 402.894, 'eval_steps_per_second': 25.204, 'epoch': 48.0}
{'loss': 0.0036, 'grad_norm': 0.13058805465698242, 'learning_rate': 1.0390337826010852e-05, 'epoch': 48.05}
{'loss': 0.0035, 'grad_norm': 0.19395993649959564, 'learning_rate': 1.0372833887624716e-05, 'epoch': 48.14}
{'loss': 0.0034, 'grad_norm': 0.09157435595989227, 'learning_rate': 1.035532994923858e-05, 'epoch': 48.22}
{'loss': 0.0034, 'grad_norm': 0.025417551398277283, 'learning_rate': 1.0337826010852443e-05, 'epoch': 48.31}
{'loss': 0.0034, 'grad_norm': 0.4553525745868683, 'learning_rate': 1.0320322072466307e-05, 'epoch': 48.4}
{'loss': 0.0034, 'grad_norm': 0.5003842711448669, 'learning_rate': 1.0302818134080169e-05, 'epoch': 48.49}
{'loss': 0.0035, 'grad_norm': 0.09364085644483566, 'learning_rate': 

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014179429039359093, 'eval_accuracy': 0.7691735383470767, 'eval_f1': 0.8565247096333409, 'eval_precision_1': 0.9011158269441402, 'eval_recall': 0.8161386322772646, 'eval_runtime': 40.0037, 'eval_samples_per_second': 403.188, 'eval_steps_per_second': 25.223, 'epoch': 49.0}
{'loss': 0.0036, 'grad_norm': 0.20290234684944153, 'learning_rate': 1.0197794503763348e-05, 'epoch': 49.01}
{'loss': 0.0032, 'grad_norm': 0.1578168123960495, 'learning_rate': 1.0180290565377212e-05, 'epoch': 49.1}
{'loss': 0.0034, 'grad_norm': 0.08265580236911774, 'learning_rate': 1.0162786626991074e-05, 'epoch': 49.19}
{'loss': 0.0034, 'grad_norm': 0.08964233100414276, 'learning_rate': 1.0145282688604937e-05, 'epoch': 49.27}
{'loss': 0.0035, 'grad_norm': 0.08024994283914566, 'learning_rate': 1.0127778750218799e-05, 'epoch': 49.36}
{'loss': 0.0035, 'grad_norm': 1.3267207145690918, 'learning_rate': 1.0110274811832663e-05, 'epoch': 49.45}
{'loss': 0.0035, 'grad_norm': 0.10225451737642288, 'learning_rate':

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014750145375728607, 'eval_accuracy': 0.7693595387190775, 'eval_f1': 0.8536557582464077, 'eval_precision_1': 0.8984688110163396, 'eval_recall': 0.8131006262012525, 'eval_runtime': 40.0071, 'eval_samples_per_second': 403.153, 'eval_steps_per_second': 25.221, 'epoch': 50.0}
{'loss': 0.0034, 'grad_norm': 0.07507941871881485, 'learning_rate': 9.987747243129706e-06, 'epoch': 50.06}
{'loss': 0.0032, 'grad_norm': 0.19683738052845, 'learning_rate': 9.970243304743568e-06, 'epoch': 50.15}
{'loss': 0.0032, 'grad_norm': 0.0672985091805458, 'learning_rate': 9.95273936635743e-06, 'epoch': 50.24}
{'loss': 0.0032, 'grad_norm': 0.21262413263320923, 'learning_rate': 9.935235427971294e-06, 'epoch': 50.32}
{'loss': 0.0034, 'grad_norm': 0.3406096398830414, 'learning_rate': 9.917731489585157e-06, 'epoch': 50.41}
{'loss': 0.0034, 'grad_norm': 0.11720714718103409, 'learning_rate': 9.90022755119902e-06, 'epoch': 50.5}
{'loss': 0.0033, 'grad_norm': 0.06961672008037567, 'learning_rate': 9.88272361

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014391845092177391, 'eval_accuracy': 0.7694835389670779, 'eval_f1': 0.8565117827619249, 'eval_precision_1': 0.8995053726761044, 'eval_recall': 0.8174406348812697, 'eval_runtime': 39.896, 'eval_samples_per_second': 404.276, 'eval_steps_per_second': 25.291, 'epoch': 51.0}
{'loss': 0.0036, 'grad_norm': 0.06931164115667343, 'learning_rate': 9.7952039208822e-06, 'epoch': 51.02}
{'loss': 0.0031, 'grad_norm': 0.050825104117393494, 'learning_rate': 9.777699982496062e-06, 'epoch': 51.11}
{'loss': 0.0032, 'grad_norm': 0.049970611929893494, 'learning_rate': 9.760196044109926e-06, 'epoch': 51.2}
{'loss': 0.0034, 'grad_norm': 0.04195016995072365, 'learning_rate': 9.742692105723788e-06, 'epoch': 51.29}
{'loss': 0.0034, 'grad_norm': 1.5335358381271362, 'learning_rate': 9.72518816733765e-06, 'epoch': 51.37}
{'loss': 0.0034, 'grad_norm': 0.2427739053964615, 'learning_rate': 9.707684228951515e-06, 'epoch': 51.46}
{'loss': 0.0034, 'grad_norm': 0.13680024445056915, 'learning_rate': 9.69018

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014639426954090595, 'eval_accuracy': 0.775745551491103, 'eval_f1': 0.8574201374848363, 'eval_precision_1': 0.8964385970845875, 'eval_recall': 0.8216566433132866, 'eval_runtime': 40.0349, 'eval_samples_per_second': 402.873, 'eval_steps_per_second': 25.203, 'epoch': 52.0}
{'loss': 0.0031, 'grad_norm': 0.029957111924886703, 'learning_rate': 9.585156660248558e-06, 'epoch': 52.07}
{'loss': 0.003, 'grad_norm': 0.07489196211099625, 'learning_rate': 9.56765272186242e-06, 'epoch': 52.16}
{'loss': 0.0033, 'grad_norm': 0.530509352684021, 'learning_rate': 9.550148783476282e-06, 'epoch': 52.25}
{'loss': 0.0032, 'grad_norm': 0.022272232919931412, 'learning_rate': 9.532644845090146e-06, 'epoch': 52.34}
{'loss': 0.0031, 'grad_norm': 0.08259742707014084, 'learning_rate': 9.515140906704009e-06, 'epoch': 52.42}
{'loss': 0.0032, 'grad_norm': 0.1107010766863823, 'learning_rate': 9.497636968317873e-06, 'epoch': 52.51}
{'loss': 0.0035, 'grad_norm': 0.06113993376493454, 'learning_rate': 9.4801

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014270194806158543, 'eval_accuracy': 0.7759315518631037, 'eval_f1': 0.8585637686089647, 'eval_precision_1': 0.8991280154717878, 'eval_recall': 0.821501643003286, 'eval_runtime': 39.9765, 'eval_samples_per_second': 403.462, 'eval_steps_per_second': 25.24, 'epoch': 53.0}
{'loss': 0.0031, 'grad_norm': 0.04434647411108017, 'learning_rate': 9.392613338001052e-06, 'epoch': 53.04}
{'loss': 0.003, 'grad_norm': 0.05383562296628952, 'learning_rate': 9.375109399614914e-06, 'epoch': 53.12}
{'loss': 0.0032, 'grad_norm': 0.03548626974225044, 'learning_rate': 9.357605461228776e-06, 'epoch': 53.21}
{'loss': 0.0028, 'grad_norm': 0.10344108194112778, 'learning_rate': 9.34010152284264e-06, 'epoch': 53.3}
{'loss': 0.0033, 'grad_norm': 0.09676837921142578, 'learning_rate': 9.322597584456503e-06, 'epoch': 53.39}
{'loss': 0.0032, 'grad_norm': 0.22136034071445465, 'learning_rate': 9.305093646070367e-06, 'epoch': 53.47}
{'loss': 0.0032, 'grad_norm': 0.08096284419298172, 'learning_rate': 9.28758

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014489918947219849, 'eval_accuracy': 0.7732035464070928, 'eval_f1': 0.8567764140528605, 'eval_precision_1': 0.8994137296339219, 'eval_recall': 0.817998635997272, 'eval_runtime': 40.2152, 'eval_samples_per_second': 401.067, 'eval_steps_per_second': 25.09, 'epoch': 54.0}
{'loss': 0.003, 'grad_norm': 0.0652565062046051, 'learning_rate': 9.182566077367408e-06, 'epoch': 54.09}
{'loss': 0.0032, 'grad_norm': 0.22517110407352448, 'learning_rate': 9.165062138981272e-06, 'epoch': 54.17}
{'loss': 0.0031, 'grad_norm': 0.11782388389110565, 'learning_rate': 9.147558200595134e-06, 'epoch': 54.26}
{'loss': 0.0031, 'grad_norm': 0.745867133140564, 'learning_rate': 9.130054262208998e-06, 'epoch': 54.35}
{'loss': 0.0031, 'grad_norm': 0.08118545264005661, 'learning_rate': 9.11255032382286e-06, 'epoch': 54.44}
{'loss': 0.0032, 'grad_norm': 0.1454887092113495, 'learning_rate': 9.095046385436724e-06, 'epoch': 54.52}
{'loss': 0.0032, 'grad_norm': 0.05161270126700401, 'learning_rate': 9.07754244

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014431683346629143, 'eval_accuracy': 0.7761795523591047, 'eval_f1': 0.8589591813567293, 'eval_precision_1': 0.9006291447032817, 'eval_recall': 0.8209746419492839, 'eval_runtime': 40.0682, 'eval_samples_per_second': 402.539, 'eval_steps_per_second': 25.182, 'epoch': 55.0}
{'loss': 0.0032, 'grad_norm': 1.1825436353683472, 'learning_rate': 8.990022755119903e-06, 'epoch': 55.05}
{'loss': 0.0028, 'grad_norm': 0.06571368128061295, 'learning_rate': 8.972518816733766e-06, 'epoch': 55.14}
{'loss': 0.003, 'grad_norm': 0.16459843516349792, 'learning_rate': 8.955014878347628e-06, 'epoch': 55.22}
{'loss': 0.0031, 'grad_norm': 0.19201861321926117, 'learning_rate': 8.937510939961492e-06, 'epoch': 55.31}
{'loss': 0.0032, 'grad_norm': 0.18720081448554993, 'learning_rate': 8.920007001575354e-06, 'epoch': 55.4}
{'loss': 0.0032, 'grad_norm': 0.06475222110748291, 'learning_rate': 8.902503063189218e-06, 'epoch': 55.49}
{'loss': 0.0029, 'grad_norm': 0.18401861190795898, 'learning_rate': 8.884

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014473369345068932, 'eval_accuracy': 0.7771095542191084, 'eval_f1': 0.8583951995083055, 'eval_precision_1': 0.8974263586864621, 'eval_recall': 0.8226176452352905, 'eval_runtime': 40.0908, 'eval_samples_per_second': 402.312, 'eval_steps_per_second': 25.168, 'epoch': 56.0}
{'loss': 0.0032, 'grad_norm': 0.05007682368159294, 'learning_rate': 8.797479432872397e-06, 'epoch': 56.01}
{'loss': 0.0029, 'grad_norm': 0.44898203015327454, 'learning_rate': 8.77997549448626e-06, 'epoch': 56.1}
{'loss': 0.003, 'grad_norm': 0.16730082035064697, 'learning_rate': 8.762471556100122e-06, 'epoch': 56.19}
{'loss': 0.003, 'grad_norm': 0.1459662914276123, 'learning_rate': 8.744967617713986e-06, 'epoch': 56.28}
{'loss': 0.0029, 'grad_norm': 0.3209545314311981, 'learning_rate': 8.72746367932785e-06, 'epoch': 56.36}
{'loss': 0.0029, 'grad_norm': 0.3064553439617157, 'learning_rate': 8.709959740941712e-06, 'epoch': 56.45}
{'loss': 0.0031, 'grad_norm': 0.062224697321653366, 'learning_rate': 8.6924558

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014588994905352592, 'eval_accuracy': 0.7750015500031, 'eval_f1': 0.8573467170447727, 'eval_precision_1': 0.898874494202455, 'eval_recall': 0.8194866389732779, 'eval_runtime': 40.29, 'eval_samples_per_second': 400.323, 'eval_steps_per_second': 25.043, 'epoch': 57.0}
{'loss': 0.0029, 'grad_norm': 0.05476026237010956, 'learning_rate': 8.587432172238754e-06, 'epoch': 57.06}
{'loss': 0.0029, 'grad_norm': 0.0715697854757309, 'learning_rate': 8.569928233852618e-06, 'epoch': 57.15}
{'loss': 0.0028, 'grad_norm': 0.0477752648293972, 'learning_rate': 8.55242429546648e-06, 'epoch': 57.24}
{'loss': 0.0031, 'grad_norm': 0.7181854248046875, 'learning_rate': 8.534920357080344e-06, 'epoch': 57.33}
{'loss': 0.0032, 'grad_norm': 0.07633545994758606, 'learning_rate': 8.517416418694206e-06, 'epoch': 57.41}
{'loss': 0.0031, 'grad_norm': 0.08010213822126389, 'learning_rate': 8.49991248030807e-06, 'epoch': 57.5}
{'loss': 0.0029, 'grad_norm': 0.0634050965309143, 'learning_rate': 8.4824085419219

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.01489955186843872, 'eval_accuracy': 0.7777295554591109, 'eval_f1': 0.8560670943554772, 'eval_precision_1': 0.8941097046413502, 'eval_recall': 0.8211296422592845, 'eval_runtime': 40.0545, 'eval_samples_per_second': 402.676, 'eval_steps_per_second': 25.191, 'epoch': 58.0}
{'loss': 0.0031, 'grad_norm': 0.026281848549842834, 'learning_rate': 8.39488884999125e-06, 'epoch': 58.03}
{'loss': 0.0028, 'grad_norm': 0.0683426484465599, 'learning_rate': 8.377384911605111e-06, 'epoch': 58.11}
{'loss': 0.0029, 'grad_norm': 0.050952304154634476, 'learning_rate': 8.359880973218974e-06, 'epoch': 58.2}
{'loss': 0.0029, 'grad_norm': 0.10697763413190842, 'learning_rate': 8.342377034832838e-06, 'epoch': 58.29}
{'loss': 0.003, 'grad_norm': 0.025868479162454605, 'learning_rate': 8.324873096446702e-06, 'epoch': 58.38}
{'loss': 0.0031, 'grad_norm': 0.06494490057229996, 'learning_rate': 8.307369158060564e-06, 'epoch': 58.46}
{'loss': 0.003, 'grad_norm': 0.1449962556362152, 'learning_rate': 8.2898

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014937774278223515, 'eval_accuracy': 0.7802715605431211, 'eval_f1': 0.8570275809757436, 'eval_precision_1': 0.8938562531560343, 'eval_recall': 0.8231136462272924, 'eval_runtime': 40.2242, 'eval_samples_per_second': 400.978, 'eval_steps_per_second': 25.084, 'epoch': 59.0}
{'loss': 0.003, 'grad_norm': 0.10409174859523773, 'learning_rate': 8.184841589357605e-06, 'epoch': 59.08}
{'loss': 0.0028, 'grad_norm': 0.0876464694738388, 'learning_rate': 8.16733765097147e-06, 'epoch': 59.16}
{'loss': 0.0029, 'grad_norm': 0.05672736093401909, 'learning_rate': 8.149833712585332e-06, 'epoch': 59.25}
{'loss': 0.0029, 'grad_norm': 0.055726706981658936, 'learning_rate': 8.132329774199196e-06, 'epoch': 59.34}
{'loss': 0.0027, 'grad_norm': 0.06469126790761948, 'learning_rate': 8.114825835813058e-06, 'epoch': 59.43}
{'loss': 0.003, 'grad_norm': 0.04590517282485962, 'learning_rate': 8.097321897426922e-06, 'epoch': 59.51}
{'loss': 0.0029, 'grad_norm': 0.03541639819741249, 'learning_rate': 8.079

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.014835634268820286, 'eval_accuracy': 0.7756215512431025, 'eval_f1': 0.8572493839968875, 'eval_precision_1': 0.8984369690791709, 'eval_recall': 0.8196726393452787, 'eval_runtime': 40.0875, 'eval_samples_per_second': 402.344, 'eval_steps_per_second': 25.17, 'epoch': 60.0}
{'loss': 0.0028, 'grad_norm': 0.07205217331647873, 'learning_rate': 7.9922982671101e-06, 'epoch': 60.04}
{'loss': 0.0028, 'grad_norm': 0.0725032389163971, 'learning_rate': 7.974794328723963e-06, 'epoch': 60.13}
{'loss': 0.003, 'grad_norm': 0.026917874813079834, 'learning_rate': 7.957290390337826e-06, 'epoch': 60.21}
{'loss': 0.0028, 'grad_norm': 0.08156539499759674, 'learning_rate': 7.93978645195169e-06, 'epoch': 60.3}
{'loss': 0.003, 'grad_norm': 0.10812270641326904, 'learning_rate': 7.922282513565554e-06, 'epoch': 60.39}
{'loss': 0.0027, 'grad_norm': 0.0690542608499527, 'learning_rate': 7.904778575179416e-06, 'epoch': 60.48}
{'loss': 0.003, 'grad_norm': 0.17710834741592407, 'learning_rate': 7.887274636

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.0148966358974576, 'eval_accuracy': 0.7777915555831112, 'eval_f1': 0.8579094538460296, 'eval_precision_1': 0.8962547701867549, 'eval_recall': 0.8227106454212908, 'eval_runtime': 40.0029, 'eval_samples_per_second': 403.195, 'eval_steps_per_second': 25.223, 'epoch': 61.0}
{'loss': 0.0028, 'grad_norm': 0.07926646620035172, 'learning_rate': 7.799754944862595e-06, 'epoch': 61.0}
{'loss': 0.0027, 'grad_norm': 0.7245857119560242, 'learning_rate': 7.782251006476457e-06, 'epoch': 61.09}
{'loss': 0.0029, 'grad_norm': 0.07150136679410934, 'learning_rate': 7.764747068090321e-06, 'epoch': 61.18}
{'loss': 0.0027, 'grad_norm': 0.06456363201141357, 'learning_rate': 7.747243129704184e-06, 'epoch': 61.26}
{'loss': 0.0028, 'grad_norm': 0.026240836828947067, 'learning_rate': 7.729739191318048e-06, 'epoch': 61.35}
{'loss': 0.003, 'grad_norm': 0.0442662313580513, 'learning_rate': 7.71223525293191e-06, 'epoch': 61.44}
{'loss': 0.0029, 'grad_norm': 0.11813411116600037, 'learning_rate': 7.694731

  0%|          | 0/1009 [00:00<?, ?it/s]

{'eval_loss': 0.01483156718313694, 'eval_accuracy': 0.7746915493830988, 'eval_f1': 0.8579020707087074, 'eval_precision_1': 0.8986422267481331, 'eval_recall': 0.8206956413912828, 'eval_runtime': 40.035, 'eval_samples_per_second': 402.872, 'eval_steps_per_second': 25.203, 'epoch': 62.0}
{'train_runtime': 43526.8189, 'train_samples_per_second': 209.976, 'train_steps_per_second': 13.125, 'train_loss': 0.013663096298290512, 'epoch': 62.0}


TrainOutput(global_step=354206, training_loss=0.013663096298290512, metrics={'train_runtime': 43526.8189, 'train_samples_per_second': 209.976, 'train_steps_per_second': 13.125, 'total_flos': 3.732920048296366e+17, 'train_loss': 0.013663096298290512, 'epoch': 62.0})

## Test Outputs

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import json

# Load the trained model and tokenizer
model = BertForSequenceClassification.from_pretrained("/home/j/Desktop/MLotsawa/Notebooks/Models/BertTag/bert-classifier/checkpoint-34275")
tokenizer = BertTokenizer.from_pretrained("./tibetan_tokenizer")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Load label mapping
with open("label_mapping.json", "r") as f:
    label_mapping = json.load(f)

# Example input text
input_text = "བླ་མ་དང་ལྷག་པའི་ལྷ་ལ་ཕྱག་འཚལ་ལོ།།"

# Tokenize input
encoded_input = tokenizer(
    input_text, 
    padding="max_length", 
    truncation=True, 
    max_length=128, 
    return_tensors="pt"
)
encoded_input = {key: val.to(device) for key, val in encoded_input.items()}

# Get predictions
with torch.no_grad():
    outputs = model(**encoded_input)
    logits = outputs.logits
    probabilities = torch.sigmoid(logits).cpu().numpy()
    predictions = (probabilities > 0.5).astype(int)

# Decode predictions
predicted_tags = [label_mapping[i] for i, val in enumerate(predictions[0]) if val == 1]

print("Predicted Tags:", predicted_tags)


OSError: Incorrect path_or_model_id: '/home/j/Desktop/MLotsawa/Notebooks/Models/BertTag/bert-classifier/checkpoint-34275'. Please provide either the path to a local folder or the repo_id of a model on the Hub.