In [1]:
import pandas as pd
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# Load the dataset
dataset = load_dataset("rjac/kaggle-entity-annotated-corpus-ner-dataset")

pd.set_option('display.max_colwidth', None)
df = pd.DataFrame(dataset['train'])
df = df.sample(n=2000, random_state=42)
df.head(3)

Unnamed: 0,sentence_id,tokens,ner_tags
13389,22048,"[The, report, calls, on, President, Bush, and, Congress, to, urge, Chinese, officials, not, to, use, the, global, war, against, terrorism, as, a, pretext, to, suppress, minorities, ', rights, .]","[0, 0, 0, 0, 1, 2, 0, 3, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3035,1273,"[The, construction, on, the, Baku-T'bilisi-Ceyhan, oil, pipeline, ,, the, Baku-T'bilisi-Erzerum, gas, pipeline, ,, and, the, Kars-Akhalkalaki, Railroad, are, part, of, a, strategy, to, capitalize, on, Georgia, 's, strategic, location, between, Europe, and, Asia, and, develop, its, role, as, a, transit, point, for, gas, ,, oil, and, other, goods, .]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
6013,1541,"[The, pact, was, initially, approved, after, discussions, between, President, Bush, and, Peruvian, President, Alan, Garcia, ,, but, Democrats, in, Congress, forced, U.S., officials, to, reopen, negotiations, and, add, stronger, labor, and, environmental, provisions, .]","[0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 7, 1, 2, 2, 0, 0, 3, 0, 3, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [3]:
def process_data(row):
    sentence = " ".join(row['tokens'])
    ner_tags = row['ner_tags']
    
    output_tokens = row['tokens'].copy()
    for i, tag in enumerate(ner_tags):
        if tag != 0:
            if i == 0 or ner_tags[i-1] == 0:
                output_tokens[i] = "<ENT>"
            else:
                output_tokens[i] = "<CONT>"
    
    output_sentence = " ".join(output_tokens)
    input_sentence = f"Entity Extraction: {sentence}"
    return pd.Series([input_sentence, output_sentence], index=['input', 'output'])

train_data = df.apply(process_data, axis=1)
pd.set_option('display.max_colwidth', None)
train_data.head(1)

Unnamed: 0,input,output
13389,Entity Extraction: The report calls on President Bush and Congress to urge Chinese officials not to use the global war against terrorism as a pretext to suppress minorities ' rights .,The report calls on <ENT> <CONT> and <ENT> to urge <ENT> officials not to use the global war against terrorism as a pretext to suppress minorities ' rights .


In [4]:
from sklearn.model_selection import train_test_split

# Split the dataset into training, testing, and validation sets
train_df, temp_df = train_test_split(train_data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_df, test_size=0.5, random_state=42)

In [5]:
from transformers import T5Tokenizer
from datasets import Dataset
tokenizer = T5Tokenizer.from_pretrained('t5-base')

train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)
val_dataset = Dataset.from_pandas(val_data)

def encoder(data):
    inputs = data["input"]
    targets = data["output"]
    model_inputs = tokenizer(inputs, max_length=128, padding="max_length", truncation=True)
    labels = tokenizer(text_target=targets, max_length=128, padding="max_length", truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the tokenization
train_encoded = train_dataset.map(encoder, batched=True)
test_encoded = test_dataset.map(encoder, batched=True)
val_encoded = val_dataset.map(encoder, batched=True)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [6]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

# Initialize the model
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="steps",  # Added for periodic evaluation
    save_total_limit=1, 
    save_steps=100,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoded,
    eval_dataset=val_encoded,
)

# Train the model
trainer.train()

  0%|          | 0/1250 [00:00<?, ?it/s]

{'loss': 11.0734, 'grad_norm': 37.30490493774414, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.04}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 11.52392864227295, 'eval_runtime': 1.3324, 'eval_samples_per_second': 225.161, 'eval_steps_per_second': 28.52, 'epoch': 0.04}
{'loss': 10.476, 'grad_norm': 46.46704864501953, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.08}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 11.112187385559082, 'eval_runtime': 1.3295, 'eval_samples_per_second': 225.654, 'eval_steps_per_second': 28.583, 'epoch': 0.08}
{'loss': 10.1253, 'grad_norm': 40.07225799560547, 'learning_rate': 3e-06, 'epoch': 0.12}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 10.46669864654541, 'eval_runtime': 1.3381, 'eval_samples_per_second': 224.205, 'eval_steps_per_second': 28.399, 'epoch': 0.12}
{'loss': 9.4181, 'grad_norm': 32.94902420043945, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.16}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 9.65477180480957, 'eval_runtime': 1.3385, 'eval_samples_per_second': 224.126, 'eval_steps_per_second': 28.389, 'epoch': 0.16}
{'loss': 9.2374, 'grad_norm': 32.00311279296875, 'learning_rate': 5e-06, 'epoch': 0.2}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 8.68787956237793, 'eval_runtime': 1.4109, 'eval_samples_per_second': 212.635, 'eval_steps_per_second': 26.934, 'epoch': 0.2}
{'loss': 8.4773, 'grad_norm': 31.251806259155273, 'learning_rate': 6e-06, 'epoch': 0.24}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 7.5796613693237305, 'eval_runtime': 1.4076, 'eval_samples_per_second': 213.128, 'eval_steps_per_second': 26.996, 'epoch': 0.24}
{'loss': 7.0657, 'grad_norm': 27.23091697692871, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.28}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 6.358649253845215, 'eval_runtime': 1.6878, 'eval_samples_per_second': 177.742, 'eval_steps_per_second': 22.514, 'epoch': 0.28}
{'loss': 6.12, 'grad_norm': 29.129714965820312, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.32}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 5.036281108856201, 'eval_runtime': 1.7565, 'eval_samples_per_second': 170.79, 'eval_steps_per_second': 21.633, 'epoch': 0.32}
{'loss': 5.0688, 'grad_norm': 21.42456817626953, 'learning_rate': 9e-06, 'epoch': 0.36}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 3.629814386367798, 'eval_runtime': 1.6376, 'eval_samples_per_second': 183.194, 'eval_steps_per_second': 23.205, 'epoch': 0.36}
{'loss': 3.9457, 'grad_norm': 22.95743179321289, 'learning_rate': 1e-05, 'epoch': 0.4}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 2.1798815727233887, 'eval_runtime': 1.6479, 'eval_samples_per_second': 182.05, 'eval_steps_per_second': 23.06, 'epoch': 0.4}
{'loss': 2.7638, 'grad_norm': 26.820850372314453, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.44}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 1.0867317914962769, 'eval_runtime': 2.0146, 'eval_samples_per_second': 148.912, 'eval_steps_per_second': 18.862, 'epoch': 0.44}
{'loss': 1.7392, 'grad_norm': 12.4657621383667, 'learning_rate': 1.2e-05, 'epoch': 0.48}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.6008166670799255, 'eval_runtime': 1.7885, 'eval_samples_per_second': 167.742, 'eval_steps_per_second': 21.247, 'epoch': 0.48}
{'loss': 1.4542, 'grad_norm': 7.220598220825195, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.52}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.41805389523506165, 'eval_runtime': 1.7215, 'eval_samples_per_second': 174.265, 'eval_steps_per_second': 22.074, 'epoch': 0.52}
{'loss': 0.8918, 'grad_norm': 6.09201192855835, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.56}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.36001017689704895, 'eval_runtime': 1.7432, 'eval_samples_per_second': 172.1, 'eval_steps_per_second': 21.799, 'epoch': 0.56}
{'loss': 0.6514, 'grad_norm': 2.089306116104126, 'learning_rate': 1.5e-05, 'epoch': 0.6}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.3177344501018524, 'eval_runtime': 1.6857, 'eval_samples_per_second': 177.965, 'eval_steps_per_second': 22.542, 'epoch': 0.6}
{'loss': 0.4705, 'grad_norm': 3.1086816787719727, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.64}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.2758188247680664, 'eval_runtime': 1.6831, 'eval_samples_per_second': 178.246, 'eval_steps_per_second': 22.578, 'epoch': 0.64}
{'loss': 0.3617, 'grad_norm': 1.7799618244171143, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.68}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.24247121810913086, 'eval_runtime': 1.6689, 'eval_samples_per_second': 179.761, 'eval_steps_per_second': 22.77, 'epoch': 0.68}
{'loss': 0.2759, 'grad_norm': 2.0015690326690674, 'learning_rate': 1.8e-05, 'epoch': 0.72}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.20664985477924347, 'eval_runtime': 1.49, 'eval_samples_per_second': 201.345, 'eval_steps_per_second': 25.504, 'epoch': 0.72}
{'loss': 0.2414, 'grad_norm': 1.07077157497406, 'learning_rate': 1.9e-05, 'epoch': 0.76}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.17008961737155914, 'eval_runtime': 1.6671, 'eval_samples_per_second': 179.954, 'eval_steps_per_second': 22.794, 'epoch': 0.76}
{'loss': 0.205, 'grad_norm': 1.1643494367599487, 'learning_rate': 2e-05, 'epoch': 0.8}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.13819143176078796, 'eval_runtime': 1.6138, 'eval_samples_per_second': 185.893, 'eval_steps_per_second': 23.546, 'epoch': 0.8}
{'loss': 0.163, 'grad_norm': 0.9753833413124084, 'learning_rate': 2.1e-05, 'epoch': 0.84}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.0924573615193367, 'eval_runtime': 1.9259, 'eval_samples_per_second': 155.774, 'eval_steps_per_second': 19.731, 'epoch': 0.84}
{'loss': 0.1475, 'grad_norm': 1.1909245252609253, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.88}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.06957802921533585, 'eval_runtime': 1.7921, 'eval_samples_per_second': 167.405, 'eval_steps_per_second': 21.205, 'epoch': 0.88}
{'loss': 0.124, 'grad_norm': 0.8041754961013794, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.92}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.057832248508930206, 'eval_runtime': 1.6991, 'eval_samples_per_second': 176.565, 'eval_steps_per_second': 22.365, 'epoch': 0.92}
{'loss': 0.1136, 'grad_norm': 0.7044203877449036, 'learning_rate': 2.4e-05, 'epoch': 0.96}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.052489399909973145, 'eval_runtime': 1.6788, 'eval_samples_per_second': 178.694, 'eval_steps_per_second': 22.635, 'epoch': 0.96}
{'loss': 0.0808, 'grad_norm': 0.6267316937446594, 'learning_rate': 2.5e-05, 'epoch': 1.0}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.04706688970327377, 'eval_runtime': 1.6859, 'eval_samples_per_second': 177.942, 'eval_steps_per_second': 22.539, 'epoch': 1.0}
{'loss': 0.0893, 'grad_norm': 0.7826982140541077, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.04}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.042961038649082184, 'eval_runtime': 1.7621, 'eval_samples_per_second': 170.255, 'eval_steps_per_second': 21.566, 'epoch': 1.04}
{'loss': 0.0864, 'grad_norm': 0.8148181438446045, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.08}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.03859839215874672, 'eval_runtime': 1.6594, 'eval_samples_per_second': 180.792, 'eval_steps_per_second': 22.9, 'epoch': 1.08}
{'loss': 0.0838, 'grad_norm': 0.7725770473480225, 'learning_rate': 2.8000000000000003e-05, 'epoch': 1.12}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.03606203570961952, 'eval_runtime': 1.6382, 'eval_samples_per_second': 183.126, 'eval_steps_per_second': 23.196, 'epoch': 1.12}
{'loss': 0.0565, 'grad_norm': 0.7234928011894226, 'learning_rate': 2.9e-05, 'epoch': 1.16}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.03492185100913048, 'eval_runtime': 1.6747, 'eval_samples_per_second': 179.136, 'eval_steps_per_second': 22.691, 'epoch': 1.16}
{'loss': 0.0517, 'grad_norm': 0.6713271737098694, 'learning_rate': 3e-05, 'epoch': 1.2}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.034162260591983795, 'eval_runtime': 1.6739, 'eval_samples_per_second': 179.217, 'eval_steps_per_second': 22.701, 'epoch': 1.2}
{'loss': 0.0699, 'grad_norm': 0.6595180630683899, 'learning_rate': 3.1e-05, 'epoch': 1.24}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.030170535668730736, 'eval_runtime': 1.8932, 'eval_samples_per_second': 158.464, 'eval_steps_per_second': 20.072, 'epoch': 1.24}
{'loss': 0.0541, 'grad_norm': 0.7472473978996277, 'learning_rate': 3.2000000000000005e-05, 'epoch': 1.28}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.02849797159433365, 'eval_runtime': 1.7218, 'eval_samples_per_second': 174.24, 'eval_steps_per_second': 22.07, 'epoch': 1.28}
{'loss': 0.0637, 'grad_norm': 0.5648605823516846, 'learning_rate': 3.3e-05, 'epoch': 1.32}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.03096911683678627, 'eval_runtime': 1.7903, 'eval_samples_per_second': 167.565, 'eval_steps_per_second': 21.225, 'epoch': 1.32}
{'loss': 0.0553, 'grad_norm': 0.5091552138328552, 'learning_rate': 3.4000000000000007e-05, 'epoch': 1.36}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.028003139421343803, 'eval_runtime': 1.6455, 'eval_samples_per_second': 182.32, 'eval_steps_per_second': 23.094, 'epoch': 1.36}
{'loss': 0.0493, 'grad_norm': 0.4452930986881256, 'learning_rate': 3.5e-05, 'epoch': 1.4}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.026311954483389854, 'eval_runtime': 1.6083, 'eval_samples_per_second': 186.528, 'eval_steps_per_second': 23.627, 'epoch': 1.4}
{'loss': 0.0466, 'grad_norm': 0.3099220097064972, 'learning_rate': 3.6e-05, 'epoch': 1.44}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.026950689032673836, 'eval_runtime': 1.6459, 'eval_samples_per_second': 182.273, 'eval_steps_per_second': 23.088, 'epoch': 1.44}
{'loss': 0.0456, 'grad_norm': 0.5695082545280457, 'learning_rate': 3.7e-05, 'epoch': 1.48}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.023845963180065155, 'eval_runtime': 1.6712, 'eval_samples_per_second': 179.507, 'eval_steps_per_second': 22.737, 'epoch': 1.48}
{'loss': 0.0377, 'grad_norm': 0.5810704827308655, 'learning_rate': 3.8e-05, 'epoch': 1.52}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.02308092825114727, 'eval_runtime': 1.6785, 'eval_samples_per_second': 178.73, 'eval_steps_per_second': 22.639, 'epoch': 1.52}
{'loss': 0.0362, 'grad_norm': 0.48469221591949463, 'learning_rate': 3.9000000000000006e-05, 'epoch': 1.56}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.023108409717679024, 'eval_runtime': 1.6794, 'eval_samples_per_second': 178.634, 'eval_steps_per_second': 22.627, 'epoch': 1.56}
{'loss': 0.041, 'grad_norm': 0.6170305013656616, 'learning_rate': 4e-05, 'epoch': 1.6}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.021715650334954262, 'eval_runtime': 1.6826, 'eval_samples_per_second': 178.297, 'eval_steps_per_second': 22.584, 'epoch': 1.6}
{'loss': 0.0661, 'grad_norm': 0.47529157996177673, 'learning_rate': 4.1e-05, 'epoch': 1.64}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.02029864676296711, 'eval_runtime': 1.9888, 'eval_samples_per_second': 150.843, 'eval_steps_per_second': 19.107, 'epoch': 1.64}
{'loss': 0.0379, 'grad_norm': 0.42998820543289185, 'learning_rate': 4.2e-05, 'epoch': 1.68}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.020390115678310394, 'eval_runtime': 1.646, 'eval_samples_per_second': 182.26, 'eval_steps_per_second': 23.086, 'epoch': 1.68}
{'loss': 0.0426, 'grad_norm': 0.6337922811508179, 'learning_rate': 4.3e-05, 'epoch': 1.72}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.020627517253160477, 'eval_runtime': 1.6214, 'eval_samples_per_second': 185.029, 'eval_steps_per_second': 23.437, 'epoch': 1.72}
{'loss': 0.0357, 'grad_norm': 0.4839968979358673, 'learning_rate': 4.4000000000000006e-05, 'epoch': 1.76}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.01876315474510193, 'eval_runtime': 1.6546, 'eval_samples_per_second': 181.315, 'eval_steps_per_second': 22.967, 'epoch': 1.76}
{'loss': 0.0359, 'grad_norm': 0.19934968650341034, 'learning_rate': 4.5e-05, 'epoch': 1.8}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.019160546362400055, 'eval_runtime': 1.6895, 'eval_samples_per_second': 177.572, 'eval_steps_per_second': 22.493, 'epoch': 1.8}
{'loss': 0.0335, 'grad_norm': 0.4677280783653259, 'learning_rate': 4.600000000000001e-05, 'epoch': 1.84}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.018335120752453804, 'eval_runtime': 1.6925, 'eval_samples_per_second': 177.25, 'eval_steps_per_second': 22.452, 'epoch': 1.84}
{'loss': 0.0426, 'grad_norm': 0.2635994255542755, 'learning_rate': 4.7e-05, 'epoch': 1.88}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.016371915116906166, 'eval_runtime': 1.6724, 'eval_samples_per_second': 179.382, 'eval_steps_per_second': 22.722, 'epoch': 1.88}
{'loss': 0.0295, 'grad_norm': 0.40386179089546204, 'learning_rate': 4.8e-05, 'epoch': 1.92}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.016527067869901657, 'eval_runtime': 1.6466, 'eval_samples_per_second': 182.199, 'eval_steps_per_second': 23.079, 'epoch': 1.92}
{'loss': 0.0293, 'grad_norm': 0.2828763723373413, 'learning_rate': 4.9e-05, 'epoch': 1.96}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.01732897385954857, 'eval_runtime': 1.6994, 'eval_samples_per_second': 176.532, 'eval_steps_per_second': 22.361, 'epoch': 1.96}
{'loss': 0.037, 'grad_norm': 0.40370094776153564, 'learning_rate': 5e-05, 'epoch': 2.0}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.015965275466442108, 'eval_runtime': 1.6522, 'eval_samples_per_second': 181.574, 'eval_steps_per_second': 22.999, 'epoch': 2.0}
{'loss': 0.0276, 'grad_norm': 0.557058572769165, 'learning_rate': 4.933333333333334e-05, 'epoch': 2.04}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.016129296272993088, 'eval_runtime': 1.8987, 'eval_samples_per_second': 158.005, 'eval_steps_per_second': 20.014, 'epoch': 2.04}
{'loss': 0.0294, 'grad_norm': 0.2762199938297272, 'learning_rate': 4.866666666666667e-05, 'epoch': 2.08}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.015104379504919052, 'eval_runtime': 1.7503, 'eval_samples_per_second': 171.398, 'eval_steps_per_second': 21.71, 'epoch': 2.08}
{'loss': 0.028, 'grad_norm': 0.5670009851455688, 'learning_rate': 4.8e-05, 'epoch': 2.12}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.015570211224257946, 'eval_runtime': 1.6982, 'eval_samples_per_second': 176.657, 'eval_steps_per_second': 22.377, 'epoch': 2.12}
{'loss': 0.0264, 'grad_norm': 0.3310043513774872, 'learning_rate': 4.7333333333333336e-05, 'epoch': 2.16}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.016095656901597977, 'eval_runtime': 1.6735, 'eval_samples_per_second': 179.261, 'eval_steps_per_second': 22.706, 'epoch': 2.16}
{'loss': 0.0332, 'grad_norm': 0.49520763754844666, 'learning_rate': 4.666666666666667e-05, 'epoch': 2.2}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.01480261329561472, 'eval_runtime': 1.6453, 'eval_samples_per_second': 182.335, 'eval_steps_per_second': 23.096, 'epoch': 2.2}
{'loss': 0.0212, 'grad_norm': 0.32576143741607666, 'learning_rate': 4.600000000000001e-05, 'epoch': 2.24}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.014142587780952454, 'eval_runtime': 1.7101, 'eval_samples_per_second': 175.427, 'eval_steps_per_second': 22.221, 'epoch': 2.24}
{'loss': 0.0246, 'grad_norm': 0.513897716999054, 'learning_rate': 4.5333333333333335e-05, 'epoch': 2.28}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.0149917583912611, 'eval_runtime': 1.6688, 'eval_samples_per_second': 179.77, 'eval_steps_per_second': 22.771, 'epoch': 2.28}
{'loss': 0.0258, 'grad_norm': 0.3642690181732178, 'learning_rate': 4.466666666666667e-05, 'epoch': 2.32}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.015107515268027782, 'eval_runtime': 1.652, 'eval_samples_per_second': 181.596, 'eval_steps_per_second': 23.002, 'epoch': 2.32}
{'loss': 0.0332, 'grad_norm': 0.7385220527648926, 'learning_rate': 4.4000000000000006e-05, 'epoch': 2.36}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.0140707241371274, 'eval_runtime': 1.6593, 'eval_samples_per_second': 180.794, 'eval_steps_per_second': 22.901, 'epoch': 2.36}
{'loss': 0.0263, 'grad_norm': 0.34153300523757935, 'learning_rate': 4.3333333333333334e-05, 'epoch': 2.4}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.015152749605476856, 'eval_runtime': 1.6399, 'eval_samples_per_second': 182.943, 'eval_steps_per_second': 23.173, 'epoch': 2.4}
{'loss': 0.0292, 'grad_norm': 0.33734652400016785, 'learning_rate': 4.266666666666667e-05, 'epoch': 2.44}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.015224709175527096, 'eval_runtime': 1.9102, 'eval_samples_per_second': 157.051, 'eval_steps_per_second': 19.893, 'epoch': 2.44}
{'loss': 0.0293, 'grad_norm': 0.1595005840063095, 'learning_rate': 4.2e-05, 'epoch': 2.48}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.013319321908056736, 'eval_runtime': 1.744, 'eval_samples_per_second': 172.021, 'eval_steps_per_second': 21.789, 'epoch': 2.48}
{'loss': 0.0272, 'grad_norm': 0.22322508692741394, 'learning_rate': 4.133333333333333e-05, 'epoch': 2.52}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.013033398427069187, 'eval_runtime': 1.6949, 'eval_samples_per_second': 177.007, 'eval_steps_per_second': 22.421, 'epoch': 2.52}
{'loss': 0.0265, 'grad_norm': 0.3281700313091278, 'learning_rate': 4.066666666666667e-05, 'epoch': 2.56}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.014577056281268597, 'eval_runtime': 1.6961, 'eval_samples_per_second': 176.873, 'eval_steps_per_second': 22.404, 'epoch': 2.56}
{'loss': 0.0246, 'grad_norm': 0.39065128564834595, 'learning_rate': 4e-05, 'epoch': 2.6}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.01276024617254734, 'eval_runtime': 1.6099, 'eval_samples_per_second': 186.351, 'eval_steps_per_second': 23.604, 'epoch': 2.6}
{'loss': 0.0233, 'grad_norm': 0.33200326561927795, 'learning_rate': 3.933333333333333e-05, 'epoch': 2.64}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.012625910341739655, 'eval_runtime': 1.6233, 'eval_samples_per_second': 184.805, 'eval_steps_per_second': 23.409, 'epoch': 2.64}
{'loss': 0.0269, 'grad_norm': 0.20524439215660095, 'learning_rate': 3.866666666666667e-05, 'epoch': 2.68}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.012021278962492943, 'eval_runtime': 1.6321, 'eval_samples_per_second': 183.816, 'eval_steps_per_second': 23.283, 'epoch': 2.68}
{'loss': 0.0507, 'grad_norm': 0.11464304476976395, 'learning_rate': 3.8e-05, 'epoch': 2.72}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.012151345610618591, 'eval_runtime': 1.422, 'eval_samples_per_second': 210.966, 'eval_steps_per_second': 26.722, 'epoch': 2.72}
{'loss': 0.0298, 'grad_norm': 0.48058974742889404, 'learning_rate': 3.733333333333334e-05, 'epoch': 2.76}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.011686369776725769, 'eval_runtime': 1.4227, 'eval_samples_per_second': 210.867, 'eval_steps_per_second': 26.71, 'epoch': 2.76}
{'loss': 0.0217, 'grad_norm': 0.30411651730537415, 'learning_rate': 3.6666666666666666e-05, 'epoch': 2.8}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.011056354269385338, 'eval_runtime': 1.4189, 'eval_samples_per_second': 211.432, 'eval_steps_per_second': 26.781, 'epoch': 2.8}
{'loss': 0.0218, 'grad_norm': 0.15640690922737122, 'learning_rate': 3.6e-05, 'epoch': 2.84}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.011171026155352592, 'eval_runtime': 1.422, 'eval_samples_per_second': 210.966, 'eval_steps_per_second': 26.722, 'epoch': 2.84}
{'loss': 0.02, 'grad_norm': 0.21256859600543976, 'learning_rate': 3.5333333333333336e-05, 'epoch': 2.88}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.011294904164969921, 'eval_runtime': 1.4824, 'eval_samples_per_second': 202.378, 'eval_steps_per_second': 25.635, 'epoch': 2.88}
{'loss': 0.0223, 'grad_norm': 0.1965925395488739, 'learning_rate': 3.466666666666667e-05, 'epoch': 2.92}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.010842331685125828, 'eval_runtime': 1.4512, 'eval_samples_per_second': 206.729, 'eval_steps_per_second': 26.186, 'epoch': 2.92}
{'loss': 0.0251, 'grad_norm': 0.20405399799346924, 'learning_rate': 3.4000000000000007e-05, 'epoch': 2.96}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.011167346499860287, 'eval_runtime': 1.4215, 'eval_samples_per_second': 211.051, 'eval_steps_per_second': 26.733, 'epoch': 2.96}
{'loss': 0.0219, 'grad_norm': 0.4015231728553772, 'learning_rate': 3.3333333333333335e-05, 'epoch': 3.0}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.010601326823234558, 'eval_runtime': 1.419, 'eval_samples_per_second': 211.413, 'eval_steps_per_second': 26.779, 'epoch': 3.0}
{'loss': 0.0174, 'grad_norm': 0.16255953907966614, 'learning_rate': 3.266666666666667e-05, 'epoch': 3.04}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.01045700628310442, 'eval_runtime': 1.4251, 'eval_samples_per_second': 210.507, 'eval_steps_per_second': 26.664, 'epoch': 3.04}
{'loss': 0.0158, 'grad_norm': 0.17632035911083221, 'learning_rate': 3.2000000000000005e-05, 'epoch': 3.08}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.01037826668471098, 'eval_runtime': 1.4223, 'eval_samples_per_second': 210.927, 'eval_steps_per_second': 26.717, 'epoch': 3.08}
{'loss': 0.018, 'grad_norm': 1.4487853050231934, 'learning_rate': 3.1333333333333334e-05, 'epoch': 3.12}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.010184021666646004, 'eval_runtime': 1.4303, 'eval_samples_per_second': 209.751, 'eval_steps_per_second': 26.568, 'epoch': 3.12}
{'loss': 0.0189, 'grad_norm': 0.23110058903694153, 'learning_rate': 3.066666666666667e-05, 'epoch': 3.16}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.010345048271119595, 'eval_runtime': 1.4157, 'eval_samples_per_second': 211.903, 'eval_steps_per_second': 26.841, 'epoch': 3.16}
{'loss': 0.0212, 'grad_norm': 0.2584209442138672, 'learning_rate': 3e-05, 'epoch': 3.2}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.01124563254415989, 'eval_runtime': 1.4112, 'eval_samples_per_second': 212.583, 'eval_steps_per_second': 26.927, 'epoch': 3.2}
{'loss': 0.0235, 'grad_norm': 0.3148351311683655, 'learning_rate': 2.9333333333333336e-05, 'epoch': 3.24}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.010198910720646381, 'eval_runtime': 1.4921, 'eval_samples_per_second': 201.064, 'eval_steps_per_second': 25.468, 'epoch': 3.24}
{'loss': 0.0174, 'grad_norm': 0.4014636278152466, 'learning_rate': 2.8666666666666668e-05, 'epoch': 3.28}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.009779979474842548, 'eval_runtime': 1.4017, 'eval_samples_per_second': 214.029, 'eval_steps_per_second': 27.11, 'epoch': 3.28}
{'loss': 0.02, 'grad_norm': 0.2634580433368683, 'learning_rate': 2.8000000000000003e-05, 'epoch': 3.32}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.009681054390966892, 'eval_runtime': 1.4001, 'eval_samples_per_second': 214.276, 'eval_steps_per_second': 27.142, 'epoch': 3.32}
{'loss': 0.017, 'grad_norm': 0.25788289308547974, 'learning_rate': 2.733333333333333e-05, 'epoch': 3.36}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.01009873952716589, 'eval_runtime': 1.3919, 'eval_samples_per_second': 215.526, 'eval_steps_per_second': 27.3, 'epoch': 3.36}
{'loss': 0.0181, 'grad_norm': 0.46284550428390503, 'learning_rate': 2.6666666666666667e-05, 'epoch': 3.4}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.009939748793840408, 'eval_runtime': 1.3931, 'eval_samples_per_second': 215.35, 'eval_steps_per_second': 27.278, 'epoch': 3.4}
{'loss': 0.0174, 'grad_norm': 0.23878520727157593, 'learning_rate': 2.6000000000000002e-05, 'epoch': 3.44}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.009197220206260681, 'eval_runtime': 1.4121, 'eval_samples_per_second': 212.455, 'eval_steps_per_second': 26.911, 'epoch': 3.44}
{'loss': 0.0229, 'grad_norm': 0.24363406002521515, 'learning_rate': 2.5333333333333337e-05, 'epoch': 3.48}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.009148665703833103, 'eval_runtime': 1.4243, 'eval_samples_per_second': 210.63, 'eval_steps_per_second': 26.68, 'epoch': 3.48}
{'loss': 0.02, 'grad_norm': 0.451996386051178, 'learning_rate': 2.466666666666667e-05, 'epoch': 3.52}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.009235613979399204, 'eval_runtime': 1.5123, 'eval_samples_per_second': 198.373, 'eval_steps_per_second': 25.127, 'epoch': 3.52}
{'loss': 0.0237, 'grad_norm': 0.26342371106147766, 'learning_rate': 2.4e-05, 'epoch': 3.56}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.00939060840755701, 'eval_runtime': 1.4314, 'eval_samples_per_second': 209.581, 'eval_steps_per_second': 26.547, 'epoch': 3.56}
{'loss': 0.0156, 'grad_norm': 0.265473335981369, 'learning_rate': 2.3333333333333336e-05, 'epoch': 3.6}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.009437617845833302, 'eval_runtime': 1.5453, 'eval_samples_per_second': 194.143, 'eval_steps_per_second': 24.591, 'epoch': 3.6}
{'loss': 0.0185, 'grad_norm': 0.2168886810541153, 'learning_rate': 2.2666666666666668e-05, 'epoch': 3.64}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.009137040004134178, 'eval_runtime': 1.4583, 'eval_samples_per_second': 205.725, 'eval_steps_per_second': 26.058, 'epoch': 3.64}
{'loss': 0.0234, 'grad_norm': 0.2815133035182953, 'learning_rate': 2.2000000000000003e-05, 'epoch': 3.68}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008879167027771473, 'eval_runtime': 1.4547, 'eval_samples_per_second': 206.235, 'eval_steps_per_second': 26.123, 'epoch': 3.68}
{'loss': 0.0227, 'grad_norm': 0.5887842178344727, 'learning_rate': 2.1333333333333335e-05, 'epoch': 3.72}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008909215219318867, 'eval_runtime': 1.5891, 'eval_samples_per_second': 188.782, 'eval_steps_per_second': 23.912, 'epoch': 3.72}
{'loss': 0.0137, 'grad_norm': 0.12995783984661102, 'learning_rate': 2.0666666666666666e-05, 'epoch': 3.76}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008930385112762451, 'eval_runtime': 1.6465, 'eval_samples_per_second': 182.206, 'eval_steps_per_second': 23.079, 'epoch': 3.76}
{'loss': 0.0182, 'grad_norm': 0.35130298137664795, 'learning_rate': 2e-05, 'epoch': 3.8}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008994771167635918, 'eval_runtime': 1.6696, 'eval_samples_per_second': 179.687, 'eval_steps_per_second': 22.76, 'epoch': 3.8}
{'loss': 0.0255, 'grad_norm': 0.19845779240131378, 'learning_rate': 1.9333333333333333e-05, 'epoch': 3.84}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.009274929761886597, 'eval_runtime': 1.6698, 'eval_samples_per_second': 179.667, 'eval_steps_per_second': 22.758, 'epoch': 3.84}
{'loss': 0.015, 'grad_norm': 0.2958223819732666, 'learning_rate': 1.866666666666667e-05, 'epoch': 3.88}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008868040516972542, 'eval_runtime': 1.7515, 'eval_samples_per_second': 171.282, 'eval_steps_per_second': 21.696, 'epoch': 3.88}
{'loss': 0.0144, 'grad_norm': 0.10839535295963287, 'learning_rate': 1.8e-05, 'epoch': 3.92}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008732686750590801, 'eval_runtime': 1.8028, 'eval_samples_per_second': 166.411, 'eval_steps_per_second': 21.079, 'epoch': 3.92}
{'loss': 0.0163, 'grad_norm': 0.46745702624320984, 'learning_rate': 1.7333333333333336e-05, 'epoch': 3.96}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008681092411279678, 'eval_runtime': 1.6614, 'eval_samples_per_second': 180.567, 'eval_steps_per_second': 22.872, 'epoch': 3.96}
{'loss': 0.019, 'grad_norm': 0.23146897554397583, 'learning_rate': 1.6666666666666667e-05, 'epoch': 4.0}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008544174954295158, 'eval_runtime': 1.5833, 'eval_samples_per_second': 189.475, 'eval_steps_per_second': 24.0, 'epoch': 4.0}
{'loss': 0.019, 'grad_norm': 0.22970540821552277, 'learning_rate': 1.6000000000000003e-05, 'epoch': 4.04}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008577689528465271, 'eval_runtime': 1.9895, 'eval_samples_per_second': 150.789, 'eval_steps_per_second': 19.1, 'epoch': 4.04}
{'loss': 0.0136, 'grad_norm': 0.16426855325698853, 'learning_rate': 1.5333333333333334e-05, 'epoch': 4.08}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008803610689938068, 'eval_runtime': 1.7669, 'eval_samples_per_second': 169.792, 'eval_steps_per_second': 21.507, 'epoch': 4.08}
{'loss': 0.0183, 'grad_norm': 0.20937564969062805, 'learning_rate': 1.4666666666666668e-05, 'epoch': 4.12}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.00879368744790554, 'eval_runtime': 1.6998, 'eval_samples_per_second': 176.487, 'eval_steps_per_second': 22.355, 'epoch': 4.12}
{'loss': 0.0161, 'grad_norm': 0.2801029086112976, 'learning_rate': 1.4000000000000001e-05, 'epoch': 4.16}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008432293310761452, 'eval_runtime': 1.7072, 'eval_samples_per_second': 175.724, 'eval_steps_per_second': 22.258, 'epoch': 4.16}
{'loss': 0.0139, 'grad_norm': 0.26980072259902954, 'learning_rate': 1.3333333333333333e-05, 'epoch': 4.2}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008348817005753517, 'eval_runtime': 1.8003, 'eval_samples_per_second': 166.64, 'eval_steps_per_second': 21.108, 'epoch': 4.2}
{'loss': 0.0173, 'grad_norm': 0.134576216340065, 'learning_rate': 1.2666666666666668e-05, 'epoch': 4.24}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008361787535250187, 'eval_runtime': 1.7085, 'eval_samples_per_second': 175.589, 'eval_steps_per_second': 22.241, 'epoch': 4.24}
{'loss': 0.0162, 'grad_norm': 0.4616354703903198, 'learning_rate': 1.2e-05, 'epoch': 4.28}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008388576097786427, 'eval_runtime': 1.659, 'eval_samples_per_second': 180.834, 'eval_steps_per_second': 22.906, 'epoch': 4.28}
{'loss': 0.0154, 'grad_norm': 0.2851812243461609, 'learning_rate': 1.1333333333333334e-05, 'epoch': 4.32}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008261127397418022, 'eval_runtime': 1.663, 'eval_samples_per_second': 180.398, 'eval_steps_per_second': 22.85, 'epoch': 4.32}
{'loss': 0.0164, 'grad_norm': 0.47003549337387085, 'learning_rate': 1.0666666666666667e-05, 'epoch': 4.36}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008004521019756794, 'eval_runtime': 1.6016, 'eval_samples_per_second': 187.315, 'eval_steps_per_second': 23.727, 'epoch': 4.36}
{'loss': 0.009, 'grad_norm': 0.19408054649829865, 'learning_rate': 1e-05, 'epoch': 4.4}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008034009486436844, 'eval_runtime': 1.6378, 'eval_samples_per_second': 183.172, 'eval_steps_per_second': 23.202, 'epoch': 4.4}
{'loss': 0.013, 'grad_norm': 0.28282231092453003, 'learning_rate': 9.333333333333334e-06, 'epoch': 4.44}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008044242858886719, 'eval_runtime': 2.8768, 'eval_samples_per_second': 104.283, 'eval_steps_per_second': 13.209, 'epoch': 4.44}
{'loss': 0.0162, 'grad_norm': 0.3451182544231415, 'learning_rate': 8.666666666666668e-06, 'epoch': 4.48}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008250280283391476, 'eval_runtime': 2.8404, 'eval_samples_per_second': 105.617, 'eval_steps_per_second': 13.378, 'epoch': 4.48}
{'loss': 0.0151, 'grad_norm': 0.2872339189052582, 'learning_rate': 8.000000000000001e-06, 'epoch': 4.52}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008290392346680164, 'eval_runtime': 3.1692, 'eval_samples_per_second': 94.66, 'eval_steps_per_second': 11.99, 'epoch': 4.52}
{'loss': 0.015, 'grad_norm': 0.15136420726776123, 'learning_rate': 7.333333333333334e-06, 'epoch': 4.56}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.008041823282837868, 'eval_runtime': 3.1552, 'eval_samples_per_second': 95.082, 'eval_steps_per_second': 12.044, 'epoch': 4.56}
{'loss': 0.0089, 'grad_norm': 0.09036611765623093, 'learning_rate': 6.666666666666667e-06, 'epoch': 4.6}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.007937364280223846, 'eval_runtime': 3.1441, 'eval_samples_per_second': 95.417, 'eval_steps_per_second': 12.086, 'epoch': 4.6}
{'loss': 0.0177, 'grad_norm': 0.3624504506587982, 'learning_rate': 6e-06, 'epoch': 4.64}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.007887106388807297, 'eval_runtime': 3.0572, 'eval_samples_per_second': 98.128, 'eval_steps_per_second': 12.43, 'epoch': 4.64}
{'loss': 0.0137, 'grad_norm': 0.2172195017337799, 'learning_rate': 5.333333333333334e-06, 'epoch': 4.68}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.007889598608016968, 'eval_runtime': 3.1329, 'eval_samples_per_second': 95.759, 'eval_steps_per_second': 12.13, 'epoch': 4.68}
{'loss': 0.0151, 'grad_norm': 0.28599119186401367, 'learning_rate': 4.666666666666667e-06, 'epoch': 4.72}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.00799168087542057, 'eval_runtime': 3.1317, 'eval_samples_per_second': 95.794, 'eval_steps_per_second': 12.134, 'epoch': 4.72}
{'loss': 0.0203, 'grad_norm': 0.5378378033638, 'learning_rate': 4.000000000000001e-06, 'epoch': 4.76}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.00792661216109991, 'eval_runtime': 3.1755, 'eval_samples_per_second': 94.472, 'eval_steps_per_second': 11.966, 'epoch': 4.76}
{'loss': 0.0197, 'grad_norm': 0.333296537399292, 'learning_rate': 3.3333333333333333e-06, 'epoch': 4.8}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.007835591211915016, 'eval_runtime': 3.1597, 'eval_samples_per_second': 94.946, 'eval_steps_per_second': 12.026, 'epoch': 4.8}
{'loss': 0.0183, 'grad_norm': 0.16142237186431885, 'learning_rate': 2.666666666666667e-06, 'epoch': 4.84}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.007811718620359898, 'eval_runtime': 3.0781, 'eval_samples_per_second': 97.463, 'eval_steps_per_second': 12.345, 'epoch': 4.84}
{'loss': 0.0118, 'grad_norm': 0.06563033163547516, 'learning_rate': 2.0000000000000003e-06, 'epoch': 4.88}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.007748129311949015, 'eval_runtime': 3.1125, 'eval_samples_per_second': 96.384, 'eval_steps_per_second': 12.209, 'epoch': 4.88}
{'loss': 0.0132, 'grad_norm': 0.16321663558483124, 'learning_rate': 1.3333333333333334e-06, 'epoch': 4.92}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.007708521094173193, 'eval_runtime': 3.1798, 'eval_samples_per_second': 94.347, 'eval_steps_per_second': 11.951, 'epoch': 4.92}
{'loss': 0.0179, 'grad_norm': 0.28067320585250854, 'learning_rate': 6.666666666666667e-07, 'epoch': 4.96}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.007705272175371647, 'eval_runtime': 3.1577, 'eval_samples_per_second': 95.005, 'eval_steps_per_second': 12.034, 'epoch': 4.96}
{'loss': 0.0188, 'grad_norm': 0.2869487702846527, 'learning_rate': 0.0, 'epoch': 5.0}


  0%|          | 0/38 [00:00<?, ?it/s]

{'eval_loss': 0.007703553885221481, 'eval_runtime': 3.0915, 'eval_samples_per_second': 97.039, 'eval_steps_per_second': 12.292, 'epoch': 5.0}
{'train_runtime': 520.0988, 'train_samples_per_second': 19.227, 'train_steps_per_second': 2.403, 'train_loss': 0.7479060190320015, 'epoch': 5.0}


TrainOutput(global_step=1250, training_loss=0.7479060190320015, metrics={'train_runtime': 520.0988, 'train_samples_per_second': 19.227, 'train_steps_per_second': 2.403, 'total_flos': 1522394726400000.0, 'train_loss': 0.7479060190320015, 'epoch': 5.0})

In [9]:
# Inference 
def complete_sentence(input_text):
    inputs = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(inputs, max_new_tokens=60)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [10]:
input_text = "Entity Extraction: Ken Pu is going to meet Zikun Fu and Chen Yang on Sunday morning."

print("Input: " +input_text)
print("Output: "+complete_sentence(input_text)+"\n")

Input: Entity Extraction: Ken Pu is going to meet Zikun Fu and Chen Yang on Sunday morning.
Output: ENT> CONT> is going to meet ENT> CONT> and ENT> CONT> on ENT> morning.



In [15]:
input_text = "Entity Extraction: Ken Pu and Zikun is going to have a party by the beach with penguins. Chen Yang is coming with Ma HuaTeng tomorrow for a vacation."

print("Input: " +input_text)
print("Output: "+complete_sentence(input_text)+"\n")

Input: Entity Extraction: Ken Pu and Zikun is going to have a party by the beach with penguins. Chen Yang is coming with Ma HuaTeng tomorrow for a vacation.
Output: ENT> CONT> and ENT> is going to have a party by the beach with penguins. ENT> CONT> is coming with ENT> CONT> tomorrow for a vacation.

