# Import data

In [1]:
# Mount to drive

from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/685HW1Data/646 Test

Mounted at /content/drive
/content/drive/MyDrive/685HW1Data/646 Test


In [None]:
# Install dependencies if needed

!pip install sentencepiece
!pip install accelerate
!pip install datasets transformers[sentencepiece]
!pip install Transformers
!pip install rank_bm25
!pip install evaluate
!pip install rouge-score

In [21]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import sentencepiece
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from datasets import Dataset, DatasetDict
import evaluate
import json
from torch.utils.data import random_split
from rank_bm25 import BM25Okapi
import torch
import pickle

In [3]:
# Import lamp3 data
with open('data/lamp3/train/time/questions.json', 'r') as file1, open('data/lamp3/train/time/outputs.json') as file2:

  lamp3_train_x = json.load(file1)
  lamp3_train_y = json.load(file2)['golds']

with open('data/lamp3/validation/time/questions.json', 'r') as file1, open('data/lamp3/validation/time/outputs.json') as file2:

  lamp3_test_x = json.load(file1)
  lamp3_test_y = json.load(file2)['golds']

lamp3_val_x, lamp3_train_x = random_split(lamp3_train_x, [len(lamp3_test_x), len(lamp3_train_x) - len(lamp3_test_x)])
lamp3_val_y, lamp3_train_y = random_split(lamp3_train_y, [len(lamp3_test_y), len(lamp3_train_y) - len(lamp3_test_y)])

#Import lamp7 data
with open('data/lamp7/train/time/questions.json', 'r') as file1, open('data/lamp7/train/time/outputs.json') as file2:

  lamp7_train_x = json.load(file1)
  lamp7_train_y = json.load(file2)['golds']

with open('data/lamp7/validation/time/questions.json', 'r') as file1, open('data/lamp7/validation/time/outputs.json') as file2:

  lamp7_test_x = json.load(file1)
  lamp7_test_y = json.load(file2)['golds']

lamp7_val_x, lamp7_train_x = random_split(lamp7_train_x, [len(lamp7_test_x), len(lamp7_train_x) - len(lamp7_test_x)])
lamp7_val_y, lamp7_train_y = random_split(lamp7_train_y, [len(lamp7_test_y), len(lamp7_train_y) - len(lamp7_test_y)])

# Prep query generation

In [5]:
# Generate query for retrieval by removing template from input based on LaMP Dataset
def query_gen(input, lamp):
  if (lamp == 3):
    template = "What is the score of the following review on a scale of 1 to 5? just answer with 1, 2, 3, 4, or 5 without further explanation. review: "
  elif (lamp == 7):
    template = "Paraphrase the following tweet wihout any explanation before or after it:  "
  input = input[len(template):]
  return input

In [6]:
# Test query generation
input = lamp3_train_x[0]['input']
print(input)

query = query_gen(input, 3)
print(query + "\n")

input = lamp7_train_x[0]['input']
print(input)

query = query_gen(input, 7)
print(query)

What is the score of the following review on a scale of 1 to 5? just answer with 1, 2, 3, 4, or 5 without further explanation. review: It is a bit overkill for what I need, but because it is plastic and I have several tanks, I guess it's good that I have some extras in case I need them.
This does a good job of controlling water flow. I am happy with this purchase.
It is a bit overkill for what I need, but because it is plastic and I have several tanks, I guess it's good that I have some extras in case I need them.
This does a good job of controlling water flow. I am happy with this purchase.

Paraphrase the following tweet without any explanation before or after it: I'm experiencing discomfort in my stomach because I consumed an excessive amount of candy.
I'm experiencing discomfort in my stomach because I consumed an excessive amount of candy.


# Create retrieval models

### BM25

In [7]:
# Pass in a json object
def bm25(profile, query, k=10):

  # Tokenize the corpus and the query
  corpus = [sample['text'].split(" ") for sample in profile]
  query = query.split(" ")

  # Create a model using the corpus
  retrieval = BM25Okapi(corpus)

  # Produce scores
  scores = retrieval.get_scores(query)

  top_idx = np.flip(np.argsort(scores))[:k]
  scores = scores[top_idx]
  top_docs = [profile[i] for i in top_idx]

  return scores, top_docs

In [8]:
example = lamp3_val_x[0]
profile = example['profile']
query = query_gen(example['input'], lamp=3)
print(profile[0])
print(query)

scores, top_docs = bm25(profile[:3], query, k=3)
print(scores)
print(top_docs)

{'id': '97870', 'text': "I first bought the Mother's Clay Bar system a year ago.  It sat on the shelf until I tried it on my two year old Honda CR-V.\n\nIt took less than 45 minutes to treat and wax my whole car and the vehicle now looks better than when I bought it.  You can actually hear those little rust causing impurities coming off the finish.  Stop destroying your clear coat or settling for less than a perfectly smooth finish get this product!  I do not own stock in,  or work for,  Mother's Car Care prducts.", 'score': '5', 'date': '2004-12-06'}
I gave this ukulele 3 stars because it's a Kala, one of my favorites brands. The top of this ukulele was warped at the bridge  A little warping is expected on solid top ukuleles near the bridge. The uke also had a ridge across the soundboard just below the neck.  What might be okay on a $25 uke is not acceptable on one costing $190.00.  Nearly all ukuleles today are mass produced, given cursory inspections, if any, then boxed and shipped 

### Recency

In [10]:
# Take in a profile and k value, returning k most recent samples
def recency(profile, k=10):

  return sorted(profile, key = lambda x: x['date'], reverse=True)[:k]

# Should match print of 'sorted_list' in above cell
print(recency(example_list,k=1))

[{'id': '97264100', 'text': 'This thing withstands all temperatures and weather since I have had it over the last 2 years. Highly recommend this cover and just be sure to check the dimensions before you make the purchase.', 'score': '5', 'date': '2016-10-17'}]


# Creating Prompts

This process will involve iterating through each file, and then performing slices down to our two *k* values, 1 and 4, and then also performing recency/bm25 on them at varying *r* values.

In [11]:
def prompt_maker(input, context, lamp, k=1, r=1, format=0): # if format is 0, apply Recency to BM25; if 1, apply BM25 to Recency
  # if our context is in tuple form, that means were passing in results from recency, grab the pre-attached query
  if (type(context) != type((0,0))):
    query = context[1]
    context = context[0]
  else:
    context = context[1]

  # if r == k, then that means BM25 OR Recency, add context and move on (go off of k alone)
  if r != k:
    # replace original ranked list with further ranked list based on format
    if format == 0: # applying recency to bm25
      if (r > len(context)):
        context = recency(context,k)
      else:
        context = recency(context[:r], k)
    elif format == 1:
      if (r > len(context)):
        context = bm25(context,query,k)[1]
      else:
        context = bm25(context[:r], query, k)[1]

  # Now perform the same process, checking k and building prompt based on new ranked list
  if k == 1:
    if lamp == 3:
      return str(context[0]['score']) + " is the score for " + context[0]['text'] + ". " + input
    elif lamp == 7:
      return context[0]['text'] + " is written by a person. Following the given patterns " + input
    else:
      print("Check LaMP")
      return
  elif k == 4:
    if lamp == 3:
      prompt = ""
      for i in range(4):
        prompt += str(context[i]['score']) + " is the score for " + context[i]['text']
        if i != 3:
          prompt += " and "
        else:
          prompt += ". " + input
      return prompt
    elif lamp == 7:
      prompt = ""
      for i in range(4):
        prompt += context[0]['text']
        if i != 3:
          prompt += " and "
        else:
          prompt += " are written by a person. Following the given patterns " + input
      return prompt
    else:
      print("Check LaMP")
      return
  else:
    print("Check k")
    return

# Prepare GPU and create lists of (up to) 50 most relevant/recent

By subsetting r values from the top 50, we can save time by running all intermediate steps ahead of time.

In [12]:
device = "cuda" if torch.cuda.is_available() else "cpu"
kwargs = {'num_workers': 2, 'pin_memory': True} if device=='cuda' else {}

In [None]:
# # Produce and save outputs from datasets for each of our models, at k=20

datasets = ['lamp3_train_x', 'lamp3_val_x', 'lamp3_test_x', 'lamp7_train_x', 'lamp7_val_x', 'lamp7_test_x']

bm25_retrievals_lamp3 = {k:[] for k in datasets}
bm25_retrievals_lamp7 = {k:[] for k in datasets}
recency_retrievals_lamp3 = {k:[] for k in datasets}
recency_retrievals_lamp7 = {k:[] for k in datasets}

for data in datasets:

  if data in datasets[:3]:
    for i in range(len(locals()[data])):

      if (i % 100 == 0):
        print(f"At example {i}")

      profile = locals()[data][i]['profile']
      query   = query_gen(locals()[data][i]['input'],3)

      # In prep for adding recency, will be subsetted during prompt creation
      bm25_retrievals_lamp3[data].append(bm25(profile,query,k=50))

      recency_retrievals_lamp3[data].append([recency(profile,k=50),query])

  else:
    for i in range(len(locals()[data])):

      if (i % 100 == 0):
        print(f"At example {i}")

      profile = locals()[data][i]['profile']
      query   = query_gen(locals()[data][i]['input'],7)

      # In prep for adding recency, will be subsetted during prompt creation
      bm25_retrievals_lamp7[data].append(bm25(profile,query,k=50))

      recency_retrievals_lamp7[data].append([recency(profile,k=50),query])

with open("bm25_retrievals_lamp3.pkl", "wb") as one, open("recency_retrievals_lamp3.pkl", "wb") as two:
    pickle.dump(bm25_retrievals_lamp3, one)
    pickle.dump(recency_retrievals_lamp3, two)

with open("bm25_retrievals_lamp7.pkl", "wb") as one, open("recency_retrievals_lamp7.pkl", "wb") as two:
    pickle.dump(bm25_retrievals_lamp7, one)
    pickle.dump(recency_retrievals_lamp7, two)

In [13]:
bm25_3 = pickle.load(open('bm25_retrievals_lamp3.pkl', 'rb'))['lamp3_val_x'][:500]
bm25_7 = pickle.load(open('bm25_retrievals_lamp7.pkl', 'rb'))['lamp7_val_x'][:500]
recency_3 = pickle.load(open('recency_retrievals_lamp3.pkl', 'rb'))['lamp3_val_x'][:500]
recency_7 = pickle.load(open('recency_retrievals_lamp7.pkl', 'rb'))['lamp7_val_x'][:500]

In [14]:
print(bm25_3[0])
print(recency_3[0])
print(lamp3_val_y[0])

(array([90.09715972, 82.57903285, 81.74269278, 81.30279123, 76.85456227,
       74.15218568, 72.37226556, 71.00654935, 67.8063606 , 65.58795122,
       65.2656395 , 62.83304826, 62.51558698, 61.43816757, 60.97675357,
       60.16242979, 58.98380076, 58.68375931, 58.30528875, 57.40520813,
       55.80610111, 54.68009228, 54.51152214, 54.21798176, 54.13417534,
       53.26684857, 52.78841102, 52.0389922 , 49.78567074, 49.59342505,
       49.47769915, 49.23814096, 49.17302293, 46.96334976, 46.47304743,
       46.33938155, 45.69175077, 44.89332879, 44.57021154, 44.44689324,
       44.29138242, 43.75814138, 43.21064352, 43.19088322, 42.82961142,
       42.49275011, 42.49275011, 42.4232025 , 42.36348685, 41.76601527]), [{'id': '91962560', 'text': 'Ok I would like to start off by saying I like the product. This is a real user writing this which I paid for it.\nNext\nToshiba can you please get a faster internet link for your software updates. The first unit I bought needed 5 files to be update

In [None]:
# Take a look at lengths of lamp7 vs lamp3 AFTER grabbing 50
lamp3 = 0
lamp7 = 0

for i in range(100):
  lamp3 += len(test2['lamp7_train_x'][i][0])
  lamp7 += len(test['lamp7_train_x'][i][1])

print("Average length for LaMP3: " + str(lamp3 / 100))
print("Average length for LaMP7: " + str(lamp7 / 100))

# Tuning hyperparameter r

In [16]:
lamp7_r = [5,8,11,14]
lamp3_r = [5,10,25,50]

dataset_3_y = list(lamp3_val_y)[0:500]
dataset_7_y = list(lamp7_val_y)[0:500]

# Below code based on a tutorial on finetuning FlanT5 @ https://www.datacamp.com/tutorial/flan-t5-tutorial
def preprocess(data):

  inputs = [prompt for prompt in data['prompts']]
  model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding=True)
  labels = tokenizer(data['ref'], max_length=512, truncation=True, padding=True)
  model_inputs['labels'] = labels['input_ids']
  return model_inputs

# Below code based on a tutorial on finetuning FlanT5 @ https://www.datacamp.com/tutorial/flan-t5-tutorial
def compute_metrics_3(eval_preds):
   preds, labels = eval_preds

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = [int(i) for i in tokenizer.batch_decode(preds, skip_special_tokens=True)]
   decoded_labels = [int(i) for i in tokenizer.batch_decode(labels, skip_special_tokens=True)]

   # Calculate RMSE and MAE
   rmse = mean_squared_error(decoded_labels, decoded_preds, squared=False)
   mae = mean_absolute_error(decoded_labels, decoded_preds)

   return {'root_mean_squared_error': rmse, 'mean_absolute_error': mae}

def compute_metrics(eval_preds):
   preds, labels = eval_preds
   rouge = evaluate.load('rouge')

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   mets = rouge.compute(predictions=decoded_preds, references=decoded_labels)

   return {'rouge-1': mets['rouge1'], 'rouge-l': mets['rougeL']}

# Iterate through our k valuess
for k in [4]:

  # Iterate through the r_values
  for r in lamp7_r:

    # Create a tokenizer, model and collator
    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
    model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base", device_map="auto").to(device)
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=False)

    # Combine data into one DatasetDict, readable by Trainer
    train = Dataset.from_dict({
        'prompts': [prompt_maker(x['input'], rec, 7, 4, 4, 1) for x, rec in zip(lamp7_val_x, recency_7[:400])],
        'ref': [y['output'] for y in dataset_7_y[:400]]
    })
    test = Dataset.from_dict({
        'prompts': [prompt_maker(x['input'], rec, 7, k, r, 1) for x, rec in zip(lamp7_val_x, recency_7[400:])],
        'ref': [y['output'] for y in dataset_7_y[400:]]
    })

    data = DatasetDict({
        'train': train,
        'test': test
    })

    # Below code based on a tutorial on finetuning FlanT5 @ https://www.datacamp.com/tutorial/flan-t5-tutorial
    dataset = data.map(preprocess, batched=True)

    # Drop unnecessary columns
    for x in ['train', 'test']:
      dataset[x] = dataset[x].remove_columns(['prompts', 'ref'])

    training_args = Seq2SeqTrainingArguments(
      output_dir="./bm25results/" + str(k) + "/" + str(r),
      evaluation_strategy="epoch",
      learning_rate=5e-5,
      per_device_train_batch_size=4,
      per_device_eval_batch_size=4,
      weight_decay=5e-4,
      save_total_limit=3,
      num_train_epochs=5,
      predict_with_generate=True,
      push_to_hub=False,
      remove_unused_columns=False
    )

    trainer = Seq2SeqTrainer(
      model=model,
      args=training_args,
      train_dataset=dataset['train'],
      eval_dataset=dataset['test'],
      tokenizer=tokenizer,
      data_collator=data_collator,
      compute_metrics=compute_metrics
    )

    # trainer.train()

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

# Final Models

In [17]:
# Non-personalized

# Create a tokenizer, model and collator
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base", device_map="auto").to(device)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

# Combine data into one DatasetDict, readable by Trainer
train = Dataset.from_dict({
    'prompts': [x['input'] for x in list(lamp7_train_x)[:5000]],
    'ref': [y['output'] for y in list(lamp7_train_y)[:5000]]
})
test = Dataset.from_dict({
    'prompts': [x['input'] for x in lamp7_test_x[:1000]],
    'ref': [y['output'] for y in lamp7_test_y[:1000]]
})

data = DatasetDict({
    'train': train,
    'test': test
})

# Below code based on a tutorial on finetuning FlanT5 @ https://www.datacamp.com/tutorial/flan-t5-tutorial
dataset = data.map(preprocess, batched=True)

# Drop unnecessary columns
for x in ['train', 'test']:
  dataset[x] = dataset[x].remove_columns(['prompts', 'ref'])

training_args = Seq2SeqTrainingArguments(
  output_dir="./baseModel",
  evaluation_strategy="epoch",
  learning_rate=5e-5,
  per_device_train_batch_size=4,
  per_device_eval_batch_size=4,
  weight_decay=5e-4,
  save_total_limit=3,
  num_train_epochs=10,
  predict_with_generate=True,
  push_to_hub=False,
  remove_unused_columns=False
)

trainer = Seq2SeqTrainer(
  model=model,
  args=training_args,
  train_dataset=dataset['train'],
  eval_dataset=dataset['test'],
  tokenizer=tokenizer,
  data_collator=data_collator,
  compute_metrics=compute_metrics
)

# trainer.train()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [18]:
bm25_3 = pickle.load(open('bm25_retrievals_lamp3.pkl', 'rb'))
bm25_7 = pickle.load(open('bm25_retrievals_lamp7.pkl', 'rb'))
recency_3 = pickle.load(open('recency_retrievals_lamp3.pkl', 'rb'))
recency_7 = pickle.load(open('recency_retrievals_lamp7.pkl', 'rb'))

In [19]:
print(bm25_3['lamp3_train_x'][3000])

(array([25.04031229, 22.16992902, 21.34335864, 21.1402162 , 21.12951467,
       20.96569557, 20.4415726 , 20.34161223, 20.13056041, 20.03354454,
       19.84930992, 19.81731954, 19.47638797, 19.36851122, 19.2078329 ,
       19.19319559, 18.89490699, 18.89039438, 18.72335047, 18.61837601,
       18.55836874, 18.51147801, 17.94326032, 17.56723282, 17.54501382,
       17.47105965, 17.45368879, 17.43574253, 17.29597727, 17.0171867 ,
       16.92843608, 16.85034476, 16.75119143, 16.59043609, 16.43492594,
       16.31630105, 16.28412478, 16.25798728, 16.24022966, 16.2169362 ,
       16.13306307, 16.03602679, 16.03602679, 16.02695901, 15.98063924,
       15.92289041, 15.9043564 , 15.83347229, 15.71236183, 15.57995325]), [{'id': '9533474', 'text': 'This series is really good. I absolutely love the new way the blood is written in this one. Its a little dark in a good way. Shara and the guys are fun too. I am looking forward to the next book.', 'score': '5', 'date': '2017-12-30'}, {'id': '953347

In [20]:
# Personalized

# Create a tokenizer, model and collator
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base", device_map="auto").to(device)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

# Combine data into one DatasetDict, readable by Trainer

# To update the below for LaMP3 or LaMP7, replace the necessary dataset names in train and test between 3 and 7
# Replace the prompt_maker value to the right of 'rec' with 3 or 7 as well. The next two values are k and r respectively.
# The final value in prompt_maker is format: if applying Recency to BM25, 0. Vice versa, 1.
# Finally, replace "bm25_7" with one of the other datasets above, based on the intermediate step used.
train = Dataset.from_dict({
    'prompts': [prompt_maker(x['input'], rec, 7, 4, 5, 0) for x, rec in zip(lamp7_train_x, bm25_7['lamp7_train_x'][:5000])],
    'ref': [y['output'] for y in list(lamp7_train_y)[:5000]]
})
test = Dataset.from_dict({
    'prompts': [prompt_maker(x['input'], rec, 7, 4, 5, 0) for x, rec in zip(lamp7_test_x, bm25_7['lamp7_test_x'][:1000])],
    'ref': [y['output'] for y in lamp7_test_y[:1000]]
})

data = DatasetDict({
    'train': train,
    'test': test
})

# Below code based on a tutorial on finetuning FlanT5 @ https://www.datacamp.com/tutorial/flan-t5-tutorial
dataset = data.map(preprocess, batched=True)

# Drop unnecessary columns
for x in ['train', 'test']:
  dataset[x] = dataset[x].remove_columns(['prompts', 'ref'])

training_args = Seq2SeqTrainingArguments(
  output_dir="./baseModel",
  evaluation_strategy="epoch",
  learning_rate=5e-5,
  per_device_train_batch_size=4,
  per_device_eval_batch_size=4,
  weight_decay=5e-4,
  save_total_limit=3,
  num_train_epochs=10,
  predict_with_generate=True,
  push_to_hub=False,
  remove_unused_columns=False
)

trainer = Seq2SeqTrainer(
  model=model,
  args=training_args,
  train_dataset=dataset['train'],
  eval_dataset=dataset['test'],
  tokenizer=tokenizer,
  data_collator=data_collator,
  compute_metrics=compute_metrics
)

trainer.train()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Rouge-1,Rouge-l
1,1.5867,0.292681,0.067944,0.066547
2,1.5102,0.26927,0.071028,0.0696
3,1.4792,0.269374,0.070119,0.068612
4,1.4477,0.265511,0.070574,0.069034
5,1.4282,0.261943,0.06923,0.067587
6,1.3825,0.255659,0.068333,0.066143
7,1.361,0.255183,0.071748,0.069527
8,1.3442,0.255377,0.075617,0.072432
9,1.3507,0.253074,0.074083,0.071211
10,1.316,0.25268,0.076054,0.072957




Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]



TrainOutput(global_step=12500, training_loss=1.4696156884765625, metrics={'train_runtime': 4197.9377, 'train_samples_per_second': 11.911, 'train_steps_per_second': 2.978, 'total_flos': 2.6763719721762816e+16, 'train_loss': 1.4696156884765625, 'epoch': 10.0})