# NLU Group 1: Final Project
## EXPERIMENT 1: Fine-tuning a classifier on the Social Bias Inference Corpus

Based on: https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb

This notebook by: Cameron Clarke (ccc779@nyu.edu)

Created: 3/16/2021

# 0. Setup

In [1]:
!pip install transformers
!pip install datasets
!pip install bert_score
!pip install optuna

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


Defaulting to user installation because normal site-packages is not writeable


# 1. Preprocessing

In [2]:
from transformers import AutoTokenizer
import datasets
from datasets import load_dataset, load_metric
import numpy as np
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
# [TODO] update this for whichever model type is desired

model_checkpoint = "bert-base-cased"
batch_size = 8

In [4]:
# [TODO] update this for whichever dataset is desired

dataset = load_dataset("social_bias_frames")

Using custom data configuration default
Reusing dataset social_bias_frames (/home/ccc779/.cache/huggingface/datasets/social_bias_frames/default/0.0.0/7ccf5e07dabdba6791693ea27289996d4771f586aa88f1ff05c52645f2cfd41d)


In [5]:
metric = load_metric('bertscore')

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [7]:
sentence1_key = 'post'

In [8]:
def preprocess_function(examples, sentence1_key, sentence2_key=None):
    if sentence2_key is None:
        return tokenizer(examples[sentence1_key], truncation=True, padding=True)
    return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True, padding=True)

In [9]:
label_colname = 'offensiveYN'

relabel_dict = {
    '0.0': 0, # not offensive
    '0.5': 1, # maybe offiensive
    '1.0': 2, # offensive
    '': None # missing value
}

relabel_func = lambda column: [relabel_dict[elt] for elt in column]

In [10]:
# Following a design pattern found here: https://huggingface.co/transformers/custom_datasets.html#using-the-nlp-datasets-metrics-library


encoded_dataset = dataset.map(lambda x: preprocess_function(x, sentence1_key),  batched=True)
encoded_dataset = encoded_dataset.map(lambda x: {'labels': relabel_func(x[label_colname])},  batched=True)

new_features = encoded_dataset['train'].features.copy()
new_features["labels"] = datasets.ClassLabel(names=['no', 'maybe', 'yes'])

encoded_dataset['train'] = encoded_dataset['train'].cast(new_features)
encoded_dataset['validation'] = encoded_dataset['validation'].cast(new_features)
encoded_dataset['test'] = encoded_dataset['test'].cast(new_features)



Loading cached processed dataset at /home/ccc779/.cache/huggingface/datasets/social_bias_frames/default/0.0.0/7ccf5e07dabdba6791693ea27289996d4771f586aa88f1ff05c52645f2cfd41d/cache-d08b0826507a90dc.arrow
Loading cached processed dataset at /home/ccc779/.cache/huggingface/datasets/social_bias_frames/default/0.0.0/7ccf5e07dabdba6791693ea27289996d4771f586aa88f1ff05c52645f2cfd41d/cache-0bc1bf99c5c6c8fc.arrow
Loading cached processed dataset at /home/ccc779/.cache/huggingface/datasets/social_bias_frames/default/0.0.0/7ccf5e07dabdba6791693ea27289996d4771f586aa88f1ff05c52645f2cfd41d/cache-661e0889db4258ce.arrow
Loading cached processed dataset at /home/ccc779/.cache/huggingface/datasets/social_bias_frames/default/0.0.0/7ccf5e07dabdba6791693ea27289996d4771f586aa88f1ff05c52645f2cfd41d/cache-1dbc0410cc91c3ea.arrow
Loading cached processed dataset at /home/ccc779/.cache/huggingface/datasets/social_bias_frames/default/0.0.0/7ccf5e07dabdba6791693ea27289996d4771f586aa88f1ff05c52645f2cfd41d/cache-eb9

In [11]:
encoded_dataset = encoded_dataset.filter(lambda row: not (row['labels'] is None))

Loading cached processed dataset at /home/ccc779/.cache/huggingface/datasets/social_bias_frames/default/0.0.0/7ccf5e07dabdba6791693ea27289996d4771f586aa88f1ff05c52645f2cfd41d/cache-528f4c5949901aea.arrow
Loading cached processed dataset at /home/ccc779/.cache/huggingface/datasets/social_bias_frames/default/0.0.0/7ccf5e07dabdba6791693ea27289996d4771f586aa88f1ff05c52645f2cfd41d/cache-af55d434575f3e40.arrow
Loading cached processed dataset at /home/ccc779/.cache/huggingface/datasets/social_bias_frames/default/0.0.0/7ccf5e07dabdba6791693ea27289996d4771f586aa88f1ff05c52645f2cfd41d/cache-519c9e9c6225d5c4.arrow


# 2. Fine-tuning

In [12]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [13]:
# num_labels = 3 # The original 'offensiveYN' outcome is a categorical variable with three levels
# model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
# model.eval()
# model.to('cuda')

In [14]:
# [TODO] update this with whatever metric is desired

metric_name = "accuracy"

args = TrainingArguments(
    "test-SBIC-bert",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [15]:
# Reference: https://github.com/armandalewis/ling-ga-1012/blob/main/eval_metrics.py

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # precision, recall, f1, _ = precision_recall_fscore_support(labels, preds)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        # 'f1': f1,
        # 'precision': precision,
        # 'recall': recall
    }

# def compute_metrics(eval_pred, metric):
#     predictions, labels = eval_pred

#     predictions = np.argmax(predictions, axis=1)
#     return metric.compute(predictions=predictions, references=labels)

In [16]:
# validation_key = "validation"
# trainer = Trainer(
#     model,
#     args,
#     train_dataset=encoded_dataset['train'],
#     eval_dataset=encoded_dataset['validation'],
#     tokenizer=tokenizer,
#     compute_metrics=lambda x: compute_metrics(x, metric)
# )

# Hyperparameter Tuning

In [17]:
def model_init():
    num_labels = 3 # The original 'offensiveYN' outcome is a categorical variable with three levels
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
    model.eval()
    model.to('cuda')
    return AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

In [18]:
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
    tokenizer=tokenizer,
    # compute_metrics=lambda x: compute_metrics(x, metric)
    compute_metrics=compute_metrics
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [19]:
def my_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 5),
        "seed": trial.suggest_int("seed", 1, 40),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [batch_size]),
    }

In [20]:
best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize", hp_space=my_hp_space)

[32m[I 2021-04-06 18:23:57,182][0m A new study created in memory with name: no-name-408f9ed4-7597-494f-b64c-33a8f8c1cb4f[0m
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForS

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,1.5318,1.874566,0.3477,64.4614,255.921
2,1.3241,1.471526,0.3477,64.431,256.041
3,1.1177,0.936534,0.573074,64.465,255.906
4,0.9534,0.899757,0.573074,64.0478,257.573


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

[32m[I 2021-04-06 21:18:20,342][0m Trial 0 finished with value: 0.5730738922228283 and parameters: {'learning_rate': 0.007604930581926937, 'num_train_epochs': 4, 'seed': 37, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.5730738922228283.[0m
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- 

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,1.0602,0.901524,0.573074,64.2953,256.582
2,1.0165,0.896739,0.573074,68.4735,240.925
3,0.9312,0.905987,0.573074,65.0488,253.61


[32m[I 2021-04-06 23:30:44,291][0m Trial 1 finished with value: 0.5730738922228283 and parameters: {'learning_rate': 0.0024204774823914994, 'num_train_epochs': 3, 'seed': 15, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.5730738922228283.[0m
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification f

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,0.9403,0.898507,0.573074,64.2323,256.833


[32m[I 2021-04-07 00:14:19,705][0m Trial 2 finished with value: 0.5730738922228283 and parameters: {'learning_rate': 0.00013682797228095405, 'num_train_epochs': 1, 'seed': 8, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.5730738922228283.[0m
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification f

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,1.1565,1.153228,0.3477,63.8703,258.289
2,1.0563,0.901709,0.573074,63.8533,258.358
3,0.9857,0.891591,0.573074,64.3393,256.406
4,0.9372,0.899859,0.573074,65.9081,250.303


[32m[I 2021-04-07 03:09:13,546][0m Trial 3 finished with value: 0.5730738922228283 and parameters: {'learning_rate': 0.002107933305546124, 'num_train_epochs': 4, 'seed': 21, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.5730738922228283.[0m
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification fr

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,0.9458,0.895025,0.573074,66.6144,247.649


[32m[I 2021-04-07 03:53:54,286][0m Trial 4 finished with value: 0.5730738922228283 and parameters: {'learning_rate': 0.001881555118406718, 'num_train_epochs': 1, 'seed': 1, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.5730738922228283.[0m
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification fro

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,0.9636,0.890756,0.573074,65.9586,250.111
2,0.9502,0.913135,0.573074,66.3578,248.607
3,0.9342,0.918598,0.3477,65.5999,251.479
4,0.9335,0.900109,0.573074,66.4987,248.08
5,0.934,0.901576,0.573074,67.4226,244.681


[32m[I 2021-04-07 07:36:38,655][0m Trial 5 finished with value: 0.5730738922228283 and parameters: {'learning_rate': 0.0003395609846047122, 'num_train_epochs': 5, 'seed': 21, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.5730738922228283.[0m
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification f

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,0.9616,0.915448,0.573074,66.4947,248.095
2,0.9433,0.907768,0.573074,67.6692,243.789
3,0.9327,0.903139,0.573074,67.177,245.575


[32m[I 2021-04-07 09:51:06,824][0m Trial 6 finished with value: 0.5730738922228283 and parameters: {'learning_rate': 0.00044528480942815157, 'num_train_epochs': 3, 'seed': 18, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.5730738922228283.[0m
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification 

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,1.252,1.436709,0.3477,64.7256,254.876


[32m[I 2021-04-07 10:35:33,412][0m Trial 7 pruned. [0m
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassifi

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,1.1066,1.125596,0.3477,67.0047,246.207


[32m[I 2021-04-07 11:26:14,295][0m Trial 8 pruned. [0m
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassifi

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,0.9549,0.900838,0.573074,251.9025,65.49


[32m[I 2021-04-07 12:41:53,485][0m Trial 9 finished with value: 0.5730738922228283 and parameters: {'learning_rate': 0.002718603108561543, 'num_train_epochs': 1, 'seed': 39, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.5730738922228283.[0m


In [21]:
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

trainer.train()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,1.5318,1.874566,0.3477,64.5863,255.426
2,1.3241,1.471526,0.3477,64.4482,255.973
3,1.1177,0.936534,0.573074,64.1929,256.991
4,0.9534,0.899757,0.573074,64.1066,257.337


TrainOutput(global_step=55444, training_loss=1.3135140060337838, metrics={'train_runtime': 10425.3035, 'train_samples_per_second': 5.318, 'total_flos': 7.300386730149226e+16, 'epoch': 4.0, 'train_mem_cpu_alloc_delta': 232415232, 'train_mem_gpu_alloc_delta': -4085760, 'train_mem_cpu_peaked_delta': 330358784, 'train_mem_gpu_peaked_delta': 6076132352})

In [22]:
trainer.evaluate()

{'eval_loss': 0.9365341663360596,
 'eval_accuracy': 0.5730738922228283,
 'eval_runtime': 63.5509,
 'eval_samples_per_second': 259.587,
 'epoch': 4.0,
 'eval_mem_cpu_alloc_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_peaked_delta': 204889088}

In [23]:
trainer.state

TrainerState(epoch=4.0, global_step=55444, max_steps=55444, num_train_epochs=4, total_flos=7.300386730149226e+16, log_history=[{'loss': 1.5871, 'learning_rate': 0.00753634849385675, 'epoch': 0.04, 'step': 500}, {'loss': 1.5515, 'learning_rate': 0.007467766405786563, 'epoch': 0.07, 'step': 1000}, {'loss': 1.6586, 'learning_rate': 0.007399184317716375, 'epoch': 0.11, 'step': 1500}, {'loss': 1.6187, 'learning_rate': 0.007330602229646189, 'epoch': 0.14, 'step': 2000}, {'loss': 1.6757, 'learning_rate': 0.007262020141576, 'epoch': 0.18, 'step': 2500}, {'loss': 1.7232, 'learning_rate': 0.0071934380535058125, 'epoch': 0.22, 'step': 3000}, {'loss': 1.6531, 'learning_rate': 0.007124855965435626, 'epoch': 0.25, 'step': 3500}, {'loss': 1.7654, 'learning_rate': 0.007056273877365438, 'epoch': 0.29, 'step': 4000}, {'loss': 1.5609, 'learning_rate': 0.0069876917892952506, 'epoch': 0.32, 'step': 4500}, {'loss': 1.5825, 'learning_rate': 0.006919109701225064, 'epoch': 0.36, 'step': 5000}, {'loss': 1.544, 