In [1]:
# Authenticate with Weights & Biases before anything else is imported.
import wandb
wandb.login()
import os
# Default W&B project for this kernel. NOTE: train_model_script() further down
# overwrites WANDB_PROJECT with 'shortest_path_models' when it is called.
os.environ['WANDB_PROJECT'] = 'shortest_v3_model'
import pandas as pd
import networkx as nx
import logging
from random import shuffle
import pandas as pd
from matplotlib import pyplot as plt
import csrgraph as cg
import numpy as np
import random
import string
from itertools import combinations
import pickle
import gc
import os
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from tqdm import tqdm


# Uncomment to run without syncing anything to W&B.
#os.environ['WANDB_MODE'] = 'disabled'

# set current directory
# NOTE(review): absolute, machine-specific path. Every relative path used later
# in this notebook (data, tokenizer, config, model dirs) resolves against it.
os.chdir("/cs/student/projects1/aibh/2024/cbaumgar/MSC_THESIS")

[34m[1mwandb[0m: Currently logged in as: [33mcarobgt[0m ([33mcarobgt-ucl[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
  backends.update(_get_backends("networkx.backends"))
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os
import shutil
import random
import networkx as nx
import numpy as np
import string
from tqdm import tqdm

def get_graph_with_random_names(size):
    """Create a ``size`` x ``size`` directed grid graph with random 2-letter node names.

    Every grid cell becomes a node named by a unique pair of lowercase ASCII
    letters. Each pair of horizontally or vertically adjacent cells is linked
    by two directed edges (one per direction); every edge stores the compass
    direction of travel in its ``'direction'`` attribute
    ('EAST', 'WEST', 'SOUTH' or 'NORTH').

    Args:
        size: Side length of the grid (number of rows and of columns).

    Returns:
        A ``networkx.DiGraph``. Nodes are only created through ``add_edge``,
        so a grid with a single cell yields a graph without nodes.

    Raises:
        ValueError: if ``size * size`` exceeds the number of distinct
            2-letter names (26 * 26 = 676). Without this guard the
            name-drawing loop below could never terminate.
    """
    num_nodes = size * size
    max_names = len(string.ascii_lowercase) ** 2
    if num_nodes > max_names:
        raise ValueError(
            f"A {size}x{size} grid needs {num_nodes} unique names, but only "
            f"{max_names} two-letter names exist."
        )

    names = set()
    while len(names) < num_nodes:
        names.add(''.join(random.choices(string.ascii_lowercase, k=2)))

    # A set of strings iterates in hash order, and string hashing is randomised
    # per interpreter process. Sorting first makes the shuffled layout depend
    # only on the state of `random`, so a fixed seed reproduces the same graph
    # across kernel restarts.
    nodes = sorted(names)
    random.shuffle(nodes)  # Important for breaking name predictability

    G = nx.DiGraph()
    for r in range(size):
        for c in range(size):
            idx = r * size + c  # row-major position of cell (r, c) in `nodes`
            u = nodes[idx]
            if c < size - 1: G.add_edge(u, nodes[idx + 1], direction='EAST')
            if c > 0: G.add_edge(u, nodes[idx - 1], direction='WEST')
            if r < size - 1: G.add_edge(u, nodes[idx + size], direction='SOUTH')
            if r > 0: G.add_edge(u, nodes[idx - size], direction='NORTH')
    return G

def generate_random_walk(G, start_node, length):
    """Return a random walk over ``G`` as a list of nodes, beginning at ``start_node``.

    At every step one successor of the current node is drawn uniformly with
    ``random.choice``; nodes may be revisited (backtracking is allowed).
    The walk stops once it holds ``length`` nodes, or earlier if the current
    node has no successors. The start node is always included, so the result
    has at least one element.
    """
    walk = [start_node]
    while len(walk) < length:
        options = list(G.successors(walk[-1]))
        if options:
            walk.append(random.choice(options))
        else:
            # Dead end: nothing to move to, return what we have.
            break
    return walk

def format_path_to_string(path, G):
    """Render a node list as ``'NODE DIR NODE DIR ... NODE'``.

    The direction between two consecutive nodes is read from the
    ``'direction'`` attribute of the corresponding edge in ``G``.

    Returns an empty string for an empty path and the node itself for a
    single-node path. If a lookup raises ``KeyError`` a warning is printed and
    that segment is left out; the final node is always appended.
    """
    if not path:
        return ""
    if len(path) < 2:
        return path[0]

    segments = []
    for u, v in zip(path, path[1:]):
        try:
            segments.append(f"{u} {G.edges[u, v]['direction']}")
        except KeyError:
            # Safety net only: paths produced from the graph itself should never hit this.
            print(f"Warning: Edge ({u}, {v}) not found in graph. Skipping segment.")

    # Every kept segment is followed by a space, then the last node closes the string.
    return " ".join(segments + [path[-1]])


def generate_geometer_example(grid_size, min_walk_len, max_walk_len, max_walk_attempts=100000):
    """Build one (prompt, target) training pair for the shortest-path task.

    A fresh randomly named grid is created and a start/goal pair is sampled.
    Random context walks are then drawn until one of them visits every node of
    at least one shortest start->goal path ("Persistent Filtering"); the target
    is chosen at random among the shortest paths that the walk fully covers.

    Args:
        grid_size: Side length passed to ``get_graph_with_random_names``.
        min_walk_len: Smallest context-walk length (in nodes), inclusive.
        max_walk_len: Largest context-walk length (in nodes), inclusive.
        max_walk_attempts: Number of context walks to try before giving up.

    Returns:
        ``(prompt, target)`` as strings, or ``None`` when no shortest path
        exists or no covering walk was found within ``max_walk_attempts``.
    """
    # The task (graph, start, goal) is fixed first; only the context is resampled.
    G = get_graph_with_random_names(grid_size)
    nodes = list(G.nodes())
    start_node, goal_node = random.sample(nodes, 2)

    try:
        all_solution_paths = list(nx.all_shortest_paths(G, source=start_node, target=goal_node))
    except (nx.NetworkXNoPath, StopIteration):
        return None  # Should not happen on a connected grid
    if not all_solution_paths:
        return None

    # Pair every shortest path with the set of nodes a context must contain to support it.
    candidates = [(set(solution), solution) for solution in all_solution_paths]

    for _ in range(max_walk_attempts):
        walk_len = random.randint(min_walk_len, max_walk_len)
        walk_start_node = random.choice(nodes)
        context_walk = generate_random_walk(G, walk_start_node, walk_len)
        visited = set(context_walk)

        # Keep only the shortest paths whose nodes all appear in this walk.
        supported_paths = [
            solution for required, solution in candidates
            if required.issubset(visited)
        ]
        if not supported_paths:
            continue

        ground_truth_path = random.choice(supported_paths)

        context_str = format_path_to_string(context_walk, G)
        target_str = format_path_to_string(ground_truth_path, G)

        task_instruction = f"[SHORTEST] [START_NODE] {start_node} [GOAL] {goal_node}"
        prompt = f"[SOS] {context_str} [SEP] {task_instruction} [PLAN]"
        target = f"{target_str} [EOS]"
        return prompt, target

    return None


def generate_and_save_dataset(n_examples, grid_size, min_walk_len, max_walk_len, output_file_path, description=""):
    """Generate ``n_examples`` examples and write them to ``output_file_path``.

    Each successful call to ``generate_geometer_example`` contributes one line
    of the form ``"<prompt> <target>"``. The output file is opened in write
    mode, so existing content is replaced.

    Args:
        n_examples: Number of examples to write.
        grid_size: Grid side length forwarded to the example generator.
        min_walk_len: Minimum context-walk length forwarded to the generator.
        max_walk_len: Maximum context-walk length forwarded to the generator.
        output_file_path: Destination text file (UTF-8).
        description: Label used in the progress bar and log messages.

    Generation gives up after ``n_examples * 1000`` generator calls. A warning
    is printed only if fewer than ``n_examples`` examples were produced;
    otherwise the success rate is reported (nothing is reported when no
    attempt was needed, i.e. ``n_examples == 0``).
    """
    print(f"--- Generating {n_examples} examples for {description} ---")

    generated_count = 0
    attempts = 0
    max_total_attempts = n_examples * 1000  # Prevent infinite loops

    # Both the file and the progress bar are context-managed so they are
    # released even if example generation raises part-way through.
    with open(output_file_path, "w", encoding="utf-8") as f, \
            tqdm(total=n_examples, desc=f"Generating {description}") as pbar:
        while generated_count < n_examples and attempts < max_total_attempts:
            attempts += 1
            example = generate_geometer_example(grid_size, min_walk_len, max_walk_len)

            if example:
                prompt, target = example
                f.write(f"{prompt} {target}\n")
                generated_count += 1
                pbar.update(1)

    # Decide on the outcome, not on the attempt counter: reaching the quota on
    # the very last allowed attempt is a success, not a failure.
    if generated_count < n_examples:
        print(f"Warning: Hit maximum attempts ({max_total_attempts}). Generated {generated_count}/{n_examples} examples.")
    elif attempts:
        success_rate = generated_count / attempts * 100
        print(f"Success rate: {success_rate:.2f}% ({generated_count}/{attempts} attempts)")

In [None]:
if __name__ == "__main__":

    # Dataset sizes and grid / context-walk geometry.
    NUM_TRAIN_EXAMPLES = 1_000_000
    NUM_TEST_EXAMPLES = 10_000
    GRID_SIZE = 4
    MIN_WALK_LENGTH, MAX_WALK_LENGTH = 10, 50

    # Seed both the stdlib and the NumPy generators.
    random.seed(42)
    np.random.seed(42)

    # Always start from an empty output directory.
    output_dir = "sv3_data"
    if os.path.exists(output_dir):
        print(f"Removing existing directory: {output_dir}")
        shutil.rmtree(output_dir)
    print(f"Creating new directory: {output_dir}")
    os.makedirs(output_dir)

    train_file_path = os.path.join(output_dir, "train.txt")
    test_file_path = os.path.join(output_dir, "test.txt")

    # Settings shared by the training and testing splits.
    shared_settings = dict(
        grid_size=GRID_SIZE,
        min_walk_len=MIN_WALK_LENGTH,
        max_walk_len=MAX_WALK_LENGTH,
    )

    generate_and_save_dataset(
        n_examples=NUM_TRAIN_EXAMPLES,
        output_file_path=train_file_path,
        description="Training Set",
        **shared_settings
    )
    generate_and_save_dataset(
        n_examples=NUM_TEST_EXAMPLES,
        output_file_path=test_file_path,
        description="Testing Set",
        **shared_settings
    )

    print("\n--- SV3 data generation using 'Persistent Filtering' finished successfully! ---")



Removing existing directory: sv3_data
Creating new directory: sv3_data
--- Generating 1000000 examples for Training Set ---


Generating Training Set:   0%|          | 0/1000000 [00:00<?, ?it/s]

Generating Training Set: 100%|██████████| 1000000/1000000 [01:45<00:00, 9505.75it/s]


Success rate: 100.00% (1000000/1000000 attempts)
--- Generating 10000 examples for Testing Set ---


Generating Testing Set: 100%|██████████| 10000/10000 [00:01<00:00, 9579.47it/s]

Success rate: 100.00% (10000/10000 attempts)

--- SV3 data generation using 'Persistent Filtering' finished successfully! ---





In [27]:
# Analysis tools
from transformers import GPT2Tokenizer
import numpy as np
def analyse_dataset(file_path, tokenizer_path, max_examples=1000):
    """Print context/target token-length statistics for a generated dataset.

    Every line is expected to look like
    ``"<context> [SEP] <task> [PLAN] <target>"``. The context is the text
    before ``[SEP]`` and the target is everything after ``[PLAN]``; both are
    tokenized with the tokenizer loaded from ``tokenizer_path``.

    Args:
        file_path: Path of the UTF-8 text dataset (one example per line).
        tokenizer_path: Directory or name given to ``GPT2Tokenizer.from_pretrained``.
        max_examples: Only the first ``max_examples`` lines are read
            (default 1000). Pass ``None`` to read the whole file.

    Lines that contain no ``[PLAN]`` token (for example blank lines) are
    skipped and counted instead of aborting the analysis. Nothing is returned;
    the report is printed.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

    total_context_tokens = 0
    total_target_tokens = 0
    ratios = []
    skipped_lines = 0

    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if max_examples is not None and i >= max_examples:
                break

            parts = line.strip().split()
            if '[PLAN]' not in parts:
                # Malformed or empty line: parts.index() would raise ValueError.
                skipped_lines += 1
                continue

            sep_index = parts.index('[PLAN]')
            prompt = ' '.join(parts[:sep_index])
            target = ' '.join(parts[sep_index + 1:])

            # Context is whatever precedes [SEP]; without [SEP] the whole prompt is used.
            context_part = prompt.split('[SEP]')[0]

            context_tokens = len(tokenizer.encode(context_part))
            target_tokens = len(tokenizer.encode(target))

            # Examples with an empty target are left out to avoid dividing by zero.
            if target_tokens > 0:
                ratios.append(context_tokens / target_tokens)
                total_context_tokens += context_tokens
                total_target_tokens += target_tokens

    if skipped_lines:
        print(f"Skipped {skipped_lines} line(s) without a [PLAN] token.")

    if not ratios:
        print("No analysable examples found.")
        return

    # "Average" is the ratio of the token totals, not the mean of per-example ratios.
    avg_ratio = total_context_tokens / total_target_tokens
    median_ratio = np.median(ratios)

    print(f"Dataset Analysis ({len(ratios)} examples):")
    print(f"Average context/target ratio: {avg_ratio:.2f}")
    print(f"Median ratio: {median_ratio:.2f}")
    print(f"Ratio range: {min(ratios):.2f} - {max(ratios):.2f}")

    if avg_ratio > 3.0:
        print("Context heavily dominates - consider shorter contexts")
    elif avg_ratio < 0.5:
        print("Target dominates - consider longer contexts") 
    else:
        print("Reasonable balance")

# Report context/target token ratios for the generated training file, tokenized with the tokenizer stored in "gv6_tokenizer".
analyse_dataset("sv3_data/train.txt", "gv6_tokenizer")

Dataset Analysis (1000 examples):
Average context/target ratio: 8.82
Median ratio: 9.00
Ratio range: 1.88 - 21.43
Context heavily dominates - consider shorter contexts


**CUSTOM TOKENIZER**

The custom tokenizer used below is trained in `tokenizer_training.ipynb`.


In [2]:
class GPT:
    """Thin wrapper around a GPT-2 tokenizer/model pair for text continuation.

    Args:
        base_model: Path or name passed to ``from_pretrained`` for both the
            tokenizer and the model. When ``None`` nothing is loaded and
            ``tokenizer`` / ``model`` stay ``None``.
        base_model_name: Stored on the instance; not otherwise used by this class.
        vocab_size: Stored on the instance; not otherwise used by this class.
    """

    def __init__(self, base_model=None, base_model_name='gpt2', vocab_size=100):
        self.base_model = base_model
        self.base_model_name = base_model_name
        self.vocab_size = vocab_size

        # Always define both attributes so an unloaded wrapper fails with a
        # clear message instead of an AttributeError on first use.
        self.tokenizer = None
        self.model = None

        if self.base_model is not None:
            self.tokenizer = GPT2Tokenizer.from_pretrained(base_model)
            self.model = GPT2LMHeadModel.from_pretrained(base_model)
            # Reuse the end-of-sequence token as the padding token.
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def continue_input(self, input_sequence, max_new_tokens=5, num_return_sequences=1, no_repeat_ngram_size=0,
                       do_sample=False, temperature=0.7, num_beams=1):
        """Generate a continuation of ``input_sequence`` and return it decoded.

        Args:
            input_sequence: Prompt text.
            max_new_tokens: Maximum number of tokens to generate.
            num_return_sequences: Forwarded to ``generate``; note that only the
                first returned sequence is decoded and returned.
            no_repeat_ngram_size: Forwarded to ``generate``.
            do_sample: Sample instead of greedy/beam decoding.
            temperature: Sampling temperature; forwarded only when
                ``do_sample`` is true.
            num_beams: Forwarded to ``generate``.

        Returns:
            The decoded first output sequence as a string.

        Raises:
            RuntimeError: if no tokenizer/model has been loaded.
        """
        if self.tokenizer is None or self.model is None:
            raise RuntimeError(
                "No model loaded: construct GPT with base_model=<path or name> "
                "before calling continue_input()."
            )

        input_ids = self.tokenizer.encode(input_sequence, return_tensors='pt')

        generation_kwargs = dict(
            max_new_tokens=max_new_tokens,
            num_return_sequences=num_return_sequences,
            num_beams=num_beams,
            no_repeat_ngram_size=no_repeat_ngram_size,
            do_sample=do_sample,
        )
        if do_sample:
            # Temperature is a sampling setting; it is not forwarded for
            # greedy/beam decoding.
            generation_kwargs['temperature'] = temperature

        # Generate text
        output = self.model.generate(input_ids, **generation_kwargs)

        # Decode the first returned sequence only.
        sequence = output[0].tolist()
        return self.tokenizer.decode(sequence)
    
    

In [3]:
def load_pkl(pth):
    """Return the object unpickled from the file at ``pth``.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    with open(pth, 'rb') as handle:
        return pickle.load(handle)

def is_valid_path(sequence, graphs):
    """Return True if the node sequence is a connected path in at least one graph.

    ``sequence`` is a whitespace-separated string in which tokens at even
    positions are nodes and tokens at odd positions are edge labels. Only the
    nodes are used: for each graph, every pair of consecutive nodes must be
    joined by an edge (``graph.has_edge``). The edge labels themselves are not
    compared against the graph.

    A sequence with fewer than two nodes has no hops to check, so it is
    accepted whenever ``graphs`` is non-empty. An empty ``graphs`` yields False.
    """
    tokens = sequence.split()
    node_tokens = tokens[::2]
    hops = list(zip(node_tokens, node_tokens[1:]))

    # Valid as soon as one graph contains every hop of the path.
    return any(
        all(graph.has_edge(src, dst) for src, dst in hops)
        for graph in graphs
    )

In [None]:
from transformers import AutoConfig, GPT2Tokenizer
import os

# Where the custom tokenizer is read from and where the derived config is written.
TOKENIZER_PATH = "gv6_tokenizer"
CONFIG_SAVE_PATH = "shortest_v3_config_pdrop"

print(f"Loading custom tokenizer from: {TOKENIZER_PATH}")
my_custom_tokenizer = GPT2Tokenizer.from_pretrained(TOKENIZER_PATH)

print("Loading base gpt2 config...")
config = AutoConfig.from_pretrained('gpt2')

# Match the vocabulary size and the special-token ids to the custom tokenizer.
tokenizer_vocab_size = len(my_custom_tokenizer)
print(f"Overriding config vocab_size. Old: {config.vocab_size}, New: {tokenizer_vocab_size}")
config.vocab_size = tokenizer_vocab_size
for token_field in ("bos_token_id", "eos_token_id", "pad_token_id"):
    setattr(config, token_field, getattr(my_custom_tokenizer, token_field))
print(f"Set pad_token_id to: {config.pad_token_id}")

# Use one dropout probability for attention, residual and embedding dropout.
new_dropout_rate = 0.5
print(f"Setting all dropout probabilities to: {new_dropout_rate}")
for dropout_field in ("attn_pdrop", "resid_pdrop", "embd_pdrop"):
    setattr(config, dropout_field, new_dropout_rate)

os.makedirs(CONFIG_SAVE_PATH, exist_ok=True)
config.save_pretrained(CONFIG_SAVE_PATH)

print(f"\nCustom model config with high dropout saved successfully to: {CONFIG_SAVE_PATH}")

Loading custom tokenizer from: gv6_tokenizer
Loading base gpt2 config...
Overriding config vocab_size. Old: 50257, New: 1031
Set pad_token_id to: 0
Setting all dropout probabilities to: 0.5

Custom model config with high dropout saved successfully to: shortest_v3_config_pdrop


In [None]:
import os

# Both locations live under the current working directory.
project_cache_path = os.path.join(os.getcwd(), '.cache', 'huggingface_datasets')
wandb_path = os.path.join(os.getcwd(), 'wandb_local_runs')

# Point the Hugging Face datasets cache at the project-local folder.
os.makedirs(project_cache_path, exist_ok=True)
os.environ['HF_DATASETS_CACHE'] = project_cache_path
print(f"Hugging Face datasets cache is now set to: {project_cache_path}")

# Keep W&B run files in a project-local folder as well.
os.makedirs(wandb_path, exist_ok=True)
os.environ['WANDB_DIR'] = wandb_path
print(f"Wandb local run directory set to: {wandb_path}")

import os
# Sets W&B's WANDB__SERVICE_WAIT variable to "120".
os.environ["WANDB__SERVICE_WAIT"] = "120"

Hugging Face datasets cache is now set to: /cs/student/projects1/aibh/2024/cbaumgar/MSC_THESIS/.cache/huggingface_datasets
Wandb local run directory set to: /cs/student/projects1/aibh/2024/cbaumgar/MSC_THESIS/wandb_local_runs


In [None]:
import os
import gc
import wandb

def train_model_script(num_epochs=1,
                       output_dir='sv3_model_ft',
                       lr=5e-05,
                       wandb_run_id=None):
    """Run ``train_shortest.py`` in a subprocess with this notebook's settings.

    Args:
        num_epochs: Value for ``--num_train_epochs``.
        output_dir: Value for ``--output_dir``.
        lr: Value for ``--learning_rate``.
        wandb_run_id: If given, the training process is started with
            ``WANDB_RESUME=allow`` and ``WANDB_RUN_ID=<id>`` so that it
            continues logging to that W&B run.

    The resume variables are set only in the environment of the child
    process. They are not written to this kernel's ``os.environ``, so a later
    call without ``wandb_run_id`` does not silently resume an earlier run.
    ``WANDB_PROJECT`` is still set on the kernel's environment.

    The command's exit status is checked and a warning is printed when it is
    non-zero. Nothing is returned.
    """
    # Function-scope import: the cell defining this function does not import it.
    import subprocess

    gc.collect()
    wandb.login()
    os.environ['WANDB_PROJECT'] = 'shortest_path_models'

    save_steps = 1000
    save_total_limit = 10
    weight_decay = 0.1
    beta2 = 0.98

    data_dir = 'sv3_data' 
    tokenizer_dir = 'gv6_tokenizer'
    config_dir = 'shortest_v3_config_pdrop'
    
    train_path = f'./{data_dir}/train.txt'
    test_path = f'./{data_dir}/test.txt'

    # Environment for the training process only (copy of the kernel's environment).
    child_env = os.environ.copy()
    if wandb_run_id:
        print(f"Attempting to resume W&B run with ID: {wandb_run_id}")
        child_env['WANDB_RESUME'] = "allow"
        child_env['WANDB_RUN_ID'] = wandb_run_id

    command = f"""
    python3 ./train_shortest.py \\
        --config_name {config_dir} \\
        --tokenizer_name {tokenizer_dir} \\
        --train_file {train_path} \\
        --validation_file {test_path} \\
        --per_device_train_batch_size 128 \\
        --per_device_eval_batch_size 128\\
        --do_train \\
        --do_eval \\
        --output_dir {output_dir} \\
        --num_train_epochs {num_epochs} \\
        --save_strategy 'steps' \\
        --save_steps {save_steps} \\
        --save_total_limit {save_total_limit} \\
        --eval_strategy 'steps' \\
        --eval_steps 500 \\
        --warmup_steps 0 \\
        --learning_rate {lr} \\
        --report_to 'wandb' \\
        --lr_scheduler_type cosine \\
        --weight_decay {weight_decay} \\
        --adam_beta2 {beta2} \\
        --fp16 True
    """
    
    print("--- Running Command ---")
    print(command)
    print("-----------------------")
    
    # The command is a multi-line shell string, so it is run through the shell
    # exactly as os.system() would, but with the per-call environment.
    completed = subprocess.run(command, shell=True, env=child_env)
    if completed.returncode != 0:
        print(f"Warning: training command exited with status {completed.returncode}")

In [None]:
#!rm -rf sv2_model_fixed/
!mkdir sv3_model_ft/

mkdir: cannot create directory ‘sv3_model_unm/’: File exists


In [None]:


# Launch training (25 epochs, output in 'sv3_model_ft') and ask W&B to resume
# the existing run with the given id.
train_model_script(num_epochs=25,
                   output_dir='sv3_model_ft',
                   lr=1e-05, wandb_run_id="9p43e26h")


Attempting to resume W&B run with ID: 9p43e26h
--- Running Command ---

    python3 ./train_shortest.py \
        --config_name shortest_v3_config_pdrop \
        --tokenizer_name gv6_tokenizer \
        --train_file ./sv3_data/train.txt \
        --validation_file ./sv3_data/test.txt \
        --per_device_train_batch_size 128 \
        --per_device_eval_batch_size 128\
        --do_train \
        --do_eval \
        --output_dir sv3_model_ft \
        --num_train_epochs 25 \
        --save_strategy 'steps' \
        --save_steps 1000 \
        --save_total_limit 10 \
        --eval_strategy 'steps' \
        --eval_steps 500 \
        --warmup_steps 0 \
        --learning_rate 1e-05 \
        --report_to 'wandb' \
        --lr_scheduler_type cosine \
        --weight_decay 0.1 \
        --adam_beta2 0.98 \
        --fp16 True
    
-----------------------


  backends.update(_get_backends("networkx.backends"))


INFO:__main__:Checkpoint detected, resuming training at sv3_model_ft/checkpoint-123500. To avoid this behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch.
INFO:__main__:Loading tokenizer, config, and model...
INFO:__main__:--- Initializing a new model from scratch ---
INFO:__main__:Synchronized model embedding size with tokenizer. New size: 1031


  trainer = Trainer(


INFO:__main__:*** Starting Training ***


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
	save_steps: 1000 (from args) != 500 (from trainer_state.json)
wandb: Currently logged in as: carobgt (carobgt-ucl) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: creating run
wandb: Tracking run with wandb version 0.20.1
wandb: Run data is saved locally in /cs/student/projects1/aibh/2024/cbaumgar/MSC_THESIS/wandb_local_runs/wandb/run-20250821_142629-9p43e26h
wandb: Run `wandb offline` to turn off syncing.
wandb: Resuming run sv3_model_ft
wandb: ⭐️ View project at https://wandb.ai/carobgt-ucl/shortest_path_models
wandb: 🚀 View run at https://wandb.ai/carobgt-ucl/shortest_path_models/runs/9p43e26h
  0%|          | 0/195325 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
 63%|██████▎   | 124000/195325 [06:04<8:34:21,  2.31it/s] 

{'loss': 0.0889, 'grad_norm': 1.0165162086486816, 'learning_rate': 2.9482476497660644e-05, 'epoch': 15.87}



  0%|          | 0/79 [00:00<?, ?it/s][A
  3%|▎         | 2/79 [00:00<00:05, 12.97it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.07it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.39it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.37it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.36it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.38it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.44it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.49it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.50it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.55it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.54it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.56it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.51it/s

{'eval_loss': 0.1009625494480133, 'eval_token_accuracy': 0.9573899155829341, 'eval_exact_match_accuracy': 0.6261, 'eval_runtime': 13.273, 'eval_samples_per_second': 753.411, 'eval_steps_per_second': 5.952, 'epoch': 15.87}


 64%|██████▎   | 124500/195325 [10:49<8:35:52,  2.29it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0896, 'grad_norm': 1.3696781396865845, 'learning_rate': 2.9117190108813792e-05, 'epoch': 15.93}



  3%|▎         | 2/79 [00:00<00:05, 13.08it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.09it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.56it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.24it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.01it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.86it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.72it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.67it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.59it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.52it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.46it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.45it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.50it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.52it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.53it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.58it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.57it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10647807270288467, 'eval_token_accuracy': 0.9566324435318275, 'eval_exact_match_accuracy': 0.6202, 'eval_runtime': 28.8881, 'eval_samples_per_second': 346.163, 'eval_steps_per_second': 2.735, 'epoch': 15.93}


 64%|██████▍   | 125000/195325 [16:58<9:28:28,  2.06it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0909, 'grad_norm': 0.9367993474006653, 'learning_rate': 2.8752520898363257e-05, 'epoch': 16.0}



  3%|▎         | 2/79 [00:00<00:05, 13.03it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.51it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.46it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.45it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.50it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.52it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.53it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.58it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.57it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.58it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.56it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.11856304854154587, 'eval_token_accuracy': 0.9553912845083277, 'eval_exact_match_accuracy': 0.6172, 'eval_runtime': 20.7694, 'eval_samples_per_second': 481.478, 'eval_steps_per_second': 3.804, 'epoch': 16.0}


 64%|██████▍   | 125500/195325 [23:07<8:32:25,  2.27it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0896, 'grad_norm': 1.124891757965088, 'learning_rate': 2.838922582214314e-05, 'epoch': 16.06}



  3%|▎         | 2/79 [00:00<00:05, 13.00it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.07it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.53it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.21it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.99it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.84it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.70it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.64it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.56it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.49it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.44it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.43it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.47it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.50it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.51it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.56it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.55it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.56it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.54it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10282918810844421, 'eval_token_accuracy': 0.9572986538900297, 'eval_exact_match_accuracy': 0.624, 'eval_runtime': 12.4456, 'eval_samples_per_second': 803.496, 'eval_steps_per_second': 6.348, 'epoch': 16.06}


 65%|██████▍   | 126000/195325 [27:13<8:29:20,  2.27it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0901, 'grad_norm': 1.406823754310608, 'learning_rate': 2.8027328375467722e-05, 'epoch': 16.13}



  3%|▎         | 2/79 [00:00<00:05, 13.04it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.07it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.22it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.99it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.84it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.69it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.62it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.55it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.48it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.43it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.42it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.47it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.49it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.50it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.55it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.54it/s][A
 25%|██▌       | 20/79 [00:02<00:09,  6.55it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.53it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10580505430698395, 'eval_token_accuracy': 0.9567419575633128, 'eval_exact_match_accuracy': 0.618, 'eval_runtime': 12.55, 'eval_samples_per_second': 796.815, 'eval_steps_per_second': 6.295, 'epoch': 16.13}


 65%|██████▍   | 126500/195325 [31:25<8:20:33,  2.29it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0901, 'grad_norm': 1.3215935230255127, 'learning_rate': 2.7666851963262717e-05, 'epoch': 16.19}



  3%|▎         | 2/79 [00:00<00:05, 13.04it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.07it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.53it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.21it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.98it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.83it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.70it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.64it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.56it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.49it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.44it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.43it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.47it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.50it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.51it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.56it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.54it/s][A
 25%|██▌       | 20/79 [00:02<00:09,  6.55it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.53it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.09518380463123322, 'eval_token_accuracy': 0.9580287474332649, 'eval_exact_match_accuracy': 0.6286, 'eval_runtime': 12.4231, 'eval_samples_per_second': 804.951, 'eval_steps_per_second': 6.359, 'epoch': 16.19}


 65%|██████▌   | 127000/195325 [35:32<8:18:28,  2.28it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0897, 'grad_norm': 0.9879333972930908, 'learning_rate': 2.7307819898551485e-05, 'epoch': 16.25}



  3%|▎         | 2/79 [00:00<00:05, 13.02it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.07it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.53it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.22it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.99it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.84it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.70it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.64it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.57it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.43it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.48it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.51it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.51it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.56it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.56it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10404183715581894, 'eval_token_accuracy': 0.9571161305042208, 'eval_exact_match_accuracy': 0.6259, 'eval_runtime': 12.3983, 'eval_samples_per_second': 806.559, 'eval_steps_per_second': 6.372, 'epoch': 16.25}


 65%|██████▌   | 127500/195325 [39:38<8:10:56,  2.30it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0891, 'grad_norm': 1.3298487663269043, 'learning_rate': 2.695025540094741e-05, 'epoch': 16.32}



  3%|▎         | 2/79 [00:00<00:05, 13.04it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.22it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.99it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.84it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.70it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.57it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.49it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.43it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.47it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.50it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.51it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.56it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.55it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.56it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.54it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10515214502811432, 'eval_token_accuracy': 0.9569062286105408, 'eval_exact_match_accuracy': 0.6225, 'eval_runtime': 12.4189, 'eval_samples_per_second': 805.224, 'eval_steps_per_second': 6.361, 'epoch': 16.32}


 66%|██████▌   | 128000/195325 [43:45<8:12:30,  2.28it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0889, 'grad_norm': 1.2815712690353394, 'learning_rate': 2.659418159515221e-05, 'epoch': 16.38}



  3%|▎         | 2/79 [00:00<00:05, 13.01it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.06it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.53it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.22it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.99it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.84it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.70it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.64it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.57it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.49it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.44it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.43it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.47it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.50it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.51it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.56it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.55it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.56it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.54it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10559582710266113, 'eval_token_accuracy': 0.9567145790554414, 'eval_exact_match_accuracy': 0.6217, 'eval_runtime': 12.3902, 'eval_samples_per_second': 807.091, 'eval_steps_per_second': 6.376, 'epoch': 16.38}


 66%|██████▌   | 128500/195325 [47:53<8:12:02,  2.26it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0886, 'grad_norm': 0.7125038504600525, 'learning_rate': 2.6239621509460312e-05, 'epoch': 16.45}



  3%|▎         | 2/79 [00:00<00:05, 13.06it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.22it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.99it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.83it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.70it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.57it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.49it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.44it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.43it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.48it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.50it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.51it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.55it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.54it/s][A
 25%|██▌       | 20/79 [00:02<00:09,  6.55it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.53it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10004551708698273, 'eval_token_accuracy': 0.9568332192562172, 'eval_exact_match_accuracy': 0.6222, 'eval_runtime': 12.5486, 'eval_samples_per_second': 796.905, 'eval_steps_per_second': 6.296, 'epoch': 16.45}


 66%|██████▌   | 129000/195325 [51:59<7:59:20,  2.31it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.088, 'grad_norm': 1.0778522491455078, 'learning_rate': 2.5886598074269595e-05, 'epoch': 16.51}



  3%|▎         | 2/79 [00:00<00:05, 13.02it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.07it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.53it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.22it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.99it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.84it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.70it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.64it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.57it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.44it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.43it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.47it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.50it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.51it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.56it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.55it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.56it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.54it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.1023077666759491, 'eval_token_accuracy': 0.9572986538900297, 'eval_exact_match_accuracy': 0.626, 'eval_runtime': 12.428, 'eval_samples_per_second': 804.636, 'eval_steps_per_second': 6.357, 'epoch': 16.51}


 66%|██████▋   | 129500/195325 [56:05<7:59:56,  2.29it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0884, 'grad_norm': 1.437631368637085, 'learning_rate': 2.5535134120598503e-05, 'epoch': 16.57}



  3%|▎         | 2/79 [00:00<00:05, 13.04it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.22it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.99it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.84it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.70it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.64it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.56it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.49it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.44it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.43it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.47it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.50it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.51it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.56it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.55it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.56it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.54it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10544010996818542, 'eval_token_accuracy': 0.9569974903034452, 'eval_exact_match_accuracy': 0.6242, 'eval_runtime': 12.4274, 'eval_samples_per_second': 804.674, 'eval_steps_per_second': 6.357, 'epoch': 16.57}


 67%|██████▋   | 130000/195325 [1:00:10<7:54:01,  2.30it/s]
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0869, 'grad_norm': 1.498881220817566, 'learning_rate': 2.5185252378609392e-05, 'epoch': 16.64}



  3%|▎         | 2/79 [00:00<00:05, 13.02it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.07it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.53it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.21it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.98it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.83it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.70it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.64it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.56it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.49it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.43it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.42it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.47it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.50it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.51it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.55it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.54it/s][A
 25%|██▌       | 20/79 [00:02<00:09,  6.55it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.53it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10262873768806458, 'eval_token_accuracy': 0.9567784622404746, 'eval_exact_match_accuracy': 0.6246, 'eval_runtime': 12.445, 'eval_samples_per_second': 803.533, 'eval_steps_per_second': 6.348, 'epoch': 16.64}


 67%|██████▋   | 130500/195325 [1:04:19<7:49:41,  2.30it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.087, 'grad_norm': 1.5458972454071045, 'learning_rate': 2.483767041332517e-05, 'epoch': 16.7}



  3%|▎         | 2/79 [00:00<00:05, 13.03it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.22it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.99it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.84it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.70it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.64it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.57it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.43it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.48it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.50it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.51it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.56it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.55it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.56it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.54it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.09773968160152435, 'eval_token_accuracy': 0.9580378736025553, 'eval_exact_match_accuracy': 0.6291, 'eval_runtime': 12.4484, 'eval_samples_per_second': 803.314, 'eval_steps_per_second': 6.346, 'epoch': 16.7}


 67%|██████▋   | 131000/195325 [1:08:27<7:45:14,  2.30it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0875, 'grad_norm': 1.0886317491531372, 'learning_rate': 2.449170926393212e-05, 'epoch': 16.77}



  3%|▎         | 2/79 [00:00<00:05, 13.03it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.22it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.99it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.84it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.70it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.64it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.57it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.48it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.44it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.43it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.47it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.50it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.51it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.56it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.55it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.56it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.54it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10607296228408813, 'eval_token_accuracy': 0.9565959388546658, 'eval_exact_match_accuracy': 0.6248, 'eval_runtime': 12.4879, 'eval_samples_per_second': 800.774, 'eval_steps_per_second': 6.326, 'epoch': 16.77}


 67%|██████▋   | 131500/195325 [1:12:38<7:47:36,  2.27it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0871, 'grad_norm': 1.2698718309402466, 'learning_rate': 2.414670286374509e-05, 'epoch': 16.83}



  3%|▎         | 2/79 [00:00<00:05, 13.05it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.07it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.53it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.21it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.98it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.83it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.69it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.63it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.56it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.49it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.44it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.42it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.47it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.50it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.50it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.55it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.54it/s][A
 25%|██▌       | 20/79 [00:02<00:09,  6.55it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.53it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10783631354570389, 'eval_token_accuracy': 0.9575450604608715, 'eval_exact_match_accuracy': 0.6271, 'eval_runtime': 12.389, 'eval_samples_per_second': 807.165, 'eval_steps_per_second': 6.377, 'epoch': 16.83}


 68%|██████▊   | 132000/195325 [1:16:47<7:41:42,  2.29it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0861, 'grad_norm': 0.8787361979484558, 'learning_rate': 2.380336846899428e-05, 'epoch': 16.89}



  3%|▎         | 2/79 [00:00<00:05, 13.06it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.07it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.51it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10034206509590149, 'eval_token_accuracy': 0.9583390371891398, 'eval_exact_match_accuracy': 0.6295, 'eval_runtime': 12.355, 'eval_samples_per_second': 809.39, 'eval_steps_per_second': 6.394, 'epoch': 16.89}


 68%|██████▊   | 132500/195325 [1:20:49<7:38:09,  2.29it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0856, 'grad_norm': 0.9581685662269592, 'learning_rate': 2.3461728284080565e-05, 'epoch': 16.96}



  3%|▎         | 2/79 [00:00<00:05, 13.05it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.51it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.46it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.45it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.52it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10246473550796509, 'eval_token_accuracy': 0.957472051106548, 'eval_exact_match_accuracy': 0.6243, 'eval_runtime': 12.5305, 'eval_samples_per_second': 798.052, 'eval_steps_per_second': 6.305, 'epoch': 16.96}


 68%|██████▊   | 133000/195325 [1:25:02<7:31:51,  2.30it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0856, 'grad_norm': 0.9383909702301025, 'learning_rate': 2.312180440383558e-05, 'epoch': 17.02}



  3%|▎         | 2/79 [00:00<00:05, 13.06it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.52it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.09797608852386475, 'eval_token_accuracy': 0.9586402007757244, 'eval_exact_match_accuracy': 0.6334, 'eval_runtime': 12.363, 'eval_samples_per_second': 808.863, 'eval_steps_per_second': 6.39, 'epoch': 17.02}


 68%|██████▊   | 133500/195325 [1:29:05<7:35:27,  2.26it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.085, 'grad_norm': 1.507932186126709, 'learning_rate': 2.2783618812092637e-05, 'epoch': 17.09}



  3%|▎         | 2/79 [00:00<00:05, 13.05it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.55it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.51it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.51it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.55it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10821042209863663, 'eval_token_accuracy': 0.9574172940908053, 'eval_exact_match_accuracy': 0.6266, 'eval_runtime': 12.3659, 'eval_samples_per_second': 808.677, 'eval_steps_per_second': 6.389, 'epoch': 17.09}


 69%|██████▊   | 134000/195325 [1:33:06<7:26:44,  2.29it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0855, 'grad_norm': 1.0772370100021362, 'learning_rate': 2.2447193380265076e-05, 'epoch': 17.15}



  3%|▎         | 2/79 [00:00<00:05, 13.06it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.66it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.51it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10826315730810165, 'eval_token_accuracy': 0.9561396303901437, 'eval_exact_match_accuracy': 0.6247, 'eval_runtime': 12.381, 'eval_samples_per_second': 807.688, 'eval_steps_per_second': 6.381, 'epoch': 17.15}


 69%|██████▉   | 134500/195325 [1:37:09<7:23:40,  2.28it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0843, 'grad_norm': 0.5221109390258789, 'learning_rate': 2.2112549865931753e-05, 'epoch': 17.21}



  3%|▎         | 2/79 [00:00<00:05, 13.05it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.51it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.52it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.56it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.11297963559627533, 'eval_token_accuracy': 0.9565229295003422, 'eval_exact_match_accuracy': 0.6259, 'eval_runtime': 12.4071, 'eval_samples_per_second': 805.99, 'eval_steps_per_second': 6.367, 'epoch': 17.21}


 69%|██████▉   | 135000/195325 [1:41:11<7:20:19,  2.28it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0836, 'grad_norm': 0.9244815111160278, 'learning_rate': 2.1779709911429913e-05, 'epoch': 17.28}



  3%|▎         | 2/79 [00:00<00:05, 13.05it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.22it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.51it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.55it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.56it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10353191196918488, 'eval_token_accuracy': 0.9577367100159708, 'eval_exact_match_accuracy': 0.6302, 'eval_runtime': 12.3704, 'eval_samples_per_second': 808.384, 'eval_steps_per_second': 6.386, 'epoch': 17.28}


 69%|██████▉   | 135500/195325 [1:45:13<7:15:06,  2.29it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0845, 'grad_norm': 0.9783747792243958, 'learning_rate': 2.144869504245547e-05, 'epoch': 17.34}



  3%|▎         | 2/79 [00:00<00:05, 13.05it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.66it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.51it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.52it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.0980217233300209, 'eval_token_accuracy': 0.9577549623545517, 'eval_exact_match_accuracy': 0.6301, 'eval_runtime': 12.361, 'eval_samples_per_second': 808.994, 'eval_steps_per_second': 6.391, 'epoch': 17.34}


 70%|██████▉   | 136000/195325 [1:49:16<7:07:30,  2.31it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0837, 'grad_norm': 0.7851420640945435, 'learning_rate': 2.111952666667099e-05, 'epoch': 17.41}



  3%|▎         | 2/79 [00:00<00:05, 13.04it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.20it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.98it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.84it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.70it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.57it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.51it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10126087814569473, 'eval_token_accuracy': 0.9582660278348163, 'eval_exact_match_accuracy': 0.6303, 'eval_runtime': 12.4506, 'eval_samples_per_second': 803.173, 'eval_steps_per_second': 6.345, 'epoch': 17.41}


 70%|██████▉   | 136500/195325 [1:53:19<7:07:20,  2.29it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.083, 'grad_norm': 0.9443227648735046, 'learning_rate': 2.079287879537432e-05, 'epoch': 17.47}



  3%|▎         | 2/79 [00:00<00:05, 13.04it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.43it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.48it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.51it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10313377529382706, 'eval_token_accuracy': 0.9577367100159708, 'eval_exact_match_accuracy': 0.6329, 'eval_runtime': 12.3586, 'eval_samples_per_second': 809.152, 'eval_steps_per_second': 6.392, 'epoch': 17.47}


 70%|███████   | 137000/195325 [1:57:20<7:04:54,  2.29it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0832, 'grad_norm': 1.1777393817901611, 'learning_rate': 2.0467463350967003e-05, 'epoch': 17.53}



  3%|▎         | 2/79 [00:00<00:05, 13.05it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.66it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.51it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.46it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.52it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.58it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.09857185930013657, 'eval_token_accuracy': 0.9581930184804928, 'eval_exact_match_accuracy': 0.6277, 'eval_runtime': 12.3534, 'eval_samples_per_second': 809.495, 'eval_steps_per_second': 6.395, 'epoch': 17.53}


 70%|███████   | 137500/195325 [2:01:22<7:03:45,  2.27it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0824, 'grad_norm': 1.021289348602295, 'learning_rate': 2.0143957858762742e-05, 'epoch': 17.6}



  3%|▎         | 2/79 [00:00<00:05, 13.06it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.55it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.46it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.45it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.52it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.09662237018346786, 'eval_token_accuracy': 0.9589322381930184, 'eval_exact_match_accuracy': 0.6309, 'eval_runtime': 12.3586, 'eval_samples_per_second': 809.151, 'eval_steps_per_second': 6.392, 'epoch': 17.6}


 71%|███████   | 138000/195325 [2:05:22<6:58:16,  2.28it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0833, 'grad_norm': 0.8885197043418884, 'learning_rate': 1.9822383240771563e-05, 'epoch': 17.66}



  3%|▎         | 2/79 [00:00<00:05, 13.04it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.51it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.48it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.52it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.11630187928676605, 'eval_token_accuracy': 0.9558384668035592, 'eval_exact_match_accuracy': 0.6198, 'eval_runtime': 12.3734, 'eval_samples_per_second': 808.183, 'eval_steps_per_second': 6.385, 'epoch': 17.66}


 71%|███████   | 138500/195325 [2:09:24<6:52:44,  2.29it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0815, 'grad_norm': 0.7502596974372864, 'learning_rate': 1.9502760294128436e-05, 'epoch': 17.73}



  3%|▎         | 2/79 [00:00<00:05, 13.06it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.07it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.53it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.22it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.99it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.84it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.57it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.51it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.51it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.56it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.55it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.56it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10840527713298798, 'eval_token_accuracy': 0.9566506958704084, 'eval_exact_match_accuracy': 0.628, 'eval_runtime': 12.362, 'eval_samples_per_second': 808.929, 'eval_steps_per_second': 6.391, 'epoch': 17.73}


 71%|███████   | 139000/195325 [2:13:25<6:49:36,  2.29it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0814, 'grad_norm': 1.5360536575317383, 'learning_rate': 1.9185109689748204e-05, 'epoch': 17.79}



  3%|▎         | 2/79 [00:00<00:05, 13.04it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.07it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.70it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.57it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.48it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.51it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.55it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.09940365701913834, 'eval_token_accuracy': 0.9583755418663016, 'eval_exact_match_accuracy': 0.6297, 'eval_runtime': 12.3645, 'eval_samples_per_second': 808.767, 'eval_steps_per_second': 6.389, 'epoch': 17.79}


 71%|███████▏  | 139500/195325 [2:17:26<6:47:00,  2.29it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0807, 'grad_norm': 1.0121451616287231, 'learning_rate': 1.886945197098885e-05, 'epoch': 17.85}



  3%|▎         | 2/79 [00:00<00:05, 13.05it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.07it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.53it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.22it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.99it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.66it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.51it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.46it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.48it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.51it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.55it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.56it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10933887958526611, 'eval_token_accuracy': 0.9574811772758385, 'eval_exact_match_accuracy': 0.628, 'eval_runtime': 12.3554, 'eval_samples_per_second': 809.363, 'eval_steps_per_second': 6.394, 'epoch': 17.85}


 72%|███████▏  | 140000/195325 [2:21:27<6:40:15,  2.30it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0812, 'grad_norm': 0.900266706943512, 'learning_rate': 1.855580755232278e-05, 'epoch': 17.92}



  3%|▎         | 2/79 [00:00<00:05, 13.05it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.07it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.22it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.99it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.51it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.46it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.52it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.09675537049770355, 'eval_token_accuracy': 0.9581382614647501, 'eval_exact_match_accuracy': 0.629, 'eval_runtime': 12.3704, 'eval_samples_per_second': 808.379, 'eval_steps_per_second': 6.386, 'epoch': 17.92}


 72%|███████▏  | 140500/195325 [2:25:28<6:36:07,  2.31it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0809, 'grad_norm': 0.9411319494247437, 'learning_rate': 1.824481789675117e-05, 'epoch': 17.98}



  3%|▎         | 2/79 [00:00<00:05, 13.03it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.07it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.53it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.22it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.99it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.48it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.51it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10176090151071548, 'eval_token_accuracy': 0.9585398129135295, 'eval_exact_match_accuracy': 0.6343, 'eval_runtime': 12.44, 'eval_samples_per_second': 803.858, 'eval_steps_per_second': 6.35, 'epoch': 17.98}


 72%|███████▏  | 141000/195325 [2:29:36<6:35:13,  2.29it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.08, 'grad_norm': 1.1313024759292603, 'learning_rate': 1.7935256672055446e-05, 'epoch': 18.05}



  3%|▎         | 2/79 [00:00<00:05, 13.05it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.22it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.51it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.46it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.52it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.09651194512844086, 'eval_token_accuracy': 0.9586402007757244, 'eval_exact_match_accuracy': 0.6338, 'eval_runtime': 12.3632, 'eval_samples_per_second': 808.849, 'eval_steps_per_second': 6.39, 'epoch': 18.05}


 72%|███████▏  | 141500/195325 [2:33:36<6:29:12,  2.30it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0798, 'grad_norm': 1.3957767486572266, 'learning_rate': 1.7627769164490366e-05, 'epoch': 18.11}



  3%|▎         | 2/79 [00:00<00:05, 13.03it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.07it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.22it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.99it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.84it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.70it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.48it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.51it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10779181867837906, 'eval_token_accuracy': 0.9584485512206251, 'eval_exact_match_accuracy': 0.6317, 'eval_runtime': 12.5369, 'eval_samples_per_second': 797.648, 'eval_steps_per_second': 6.301, 'epoch': 18.11}


 73%|███████▎  | 142000/195325 [2:37:37<6:28:07,  2.29it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.08, 'grad_norm': 1.018677830696106, 'learning_rate': 1.73223752601378e-05, 'epoch': 18.17}



  3%|▎         | 2/79 [00:00<00:05, 13.05it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.07it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.57it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.52it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10423576831817627, 'eval_token_accuracy': 0.9580561259411362, 'eval_exact_match_accuracy': 0.6317, 'eval_runtime': 12.358, 'eval_samples_per_second': 809.191, 'eval_steps_per_second': 6.393, 'epoch': 18.17}


 73%|███████▎  | 142500/195325 [2:41:38<6:27:53,  2.27it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0793, 'grad_norm': 1.5543667078018188, 'learning_rate': 1.7019699148594515e-05, 'epoch': 18.24}



  3%|▎         | 2/79 [00:00<00:05, 13.05it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.07it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.22it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.99it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.84it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.52it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10098664462566376, 'eval_token_accuracy': 0.9583116586812686, 'eval_exact_match_accuracy': 0.6355, 'eval_runtime': 12.3632, 'eval_samples_per_second': 808.854, 'eval_steps_per_second': 6.39, 'epoch': 18.24}


 73%|███████▎  | 143000/195325 [2:45:50<6:21:00,  2.29it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0797, 'grad_norm': 0.3769415020942688, 'learning_rate': 1.6718547280619455e-05, 'epoch': 18.3}



  3%|▎         | 2/79 [00:00<00:05, 13.04it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.84it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.57it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.52it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10201061517000198, 'eval_token_accuracy': 0.9585671914214009, 'eval_exact_match_accuracy': 0.6343, 'eval_runtime': 12.361, 'eval_samples_per_second': 808.995, 'eval_steps_per_second': 6.391, 'epoch': 18.3}


 73%|███████▎  | 143500/195325 [2:49:51<6:18:10,  2.28it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0795, 'grad_norm': 0.6708442568778992, 'learning_rate': 1.6419547817793264e-05, 'epoch': 18.37}



  3%|▎         | 2/79 [00:00<00:05, 13.06it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.51it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.46it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.45it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.52it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10912749916315079, 'eval_token_accuracy': 0.9573990417522245, 'eval_exact_match_accuracy': 0.6306, 'eval_runtime': 12.3578, 'eval_samples_per_second': 809.208, 'eval_steps_per_second': 6.393, 'epoch': 18.37}


 74%|███████▎  | 144000/195325 [2:53:52<6:10:12,  2.31it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0789, 'grad_norm': 0.9968616962432861, 'learning_rate': 1.612331157251148e-05, 'epoch': 18.43}



  3%|▎         | 2/79 [00:00<00:05, 13.03it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.07it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.22it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.57it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.51it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.55it/s][A
 25%|██▌       | 20/79 [00:02<00:09,  6.55it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.54it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10305017232894897, 'eval_token_accuracy': 0.9580652521104267, 'eval_exact_match_accuracy': 0.6334, 'eval_runtime': 12.3858, 'eval_samples_per_second': 807.378, 'eval_steps_per_second': 6.378, 'epoch': 18.43}


 74%|███████▍  | 144500/195325 [2:57:53<6:10:09,  2.29it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0783, 'grad_norm': 0.9810534119606018, 'learning_rate': 1.582867038999664e-05, 'epoch': 18.49}



  3%|▎         | 2/79 [00:00<00:05, 13.02it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.07it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.53it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.22it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.99it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.66it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.52it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.55it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.56it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.12204990535974503, 'eval_token_accuracy': 0.9555099247091033, 'eval_exact_match_accuracy': 0.6278, 'eval_runtime': 12.3535, 'eval_samples_per_second': 809.49, 'eval_steps_per_second': 6.395, 'epoch': 18.49}


 74%|███████▍  | 145000/195325 [3:01:54<6:04:36,  2.30it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0784, 'grad_norm': 0.743665874004364, 'learning_rate': 1.5536239163471906e-05, 'epoch': 18.56}



  3%|▎         | 2/79 [00:00<00:05, 13.07it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.09it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.55it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.24it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.01it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.86it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.72it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.66it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.59it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.52it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.46it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.45it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.50it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.53it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.53it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.58it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.57it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.58it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.57it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.09785623103380203, 'eval_token_accuracy': 0.9591877709331508, 'eval_exact_match_accuracy': 0.6388, 'eval_runtime': 12.3412, 'eval_samples_per_second': 810.297, 'eval_steps_per_second': 6.401, 'epoch': 18.56}


 74%|███████▍  | 145500/195325 [3:05:54<6:07:41,  2.26it/s] 
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0773, 'grad_norm': 0.7169272303581238, 'learning_rate': 1.524661497309413e-05, 'epoch': 18.62}



  3%|▎         | 2/79 [00:00<00:05, 13.07it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.86it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.72it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.66it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.59it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.51it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.46it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.45it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.52it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.56it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.11396870017051697, 'eval_token_accuracy': 0.956869723933379, 'eval_exact_match_accuracy': 0.6282, 'eval_runtime': 12.3632, 'eval_samples_per_second': 808.852, 'eval_steps_per_second': 6.39, 'epoch': 18.62}


 75%|███████▍  | 146000/195325 [3:09:55<5:58:54,  2.29it/s]  
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0775, 'grad_norm': 1.6060206890106201, 'learning_rate': 1.4958655737544664e-05, 'epoch': 18.69}



  3%|▎         | 2/79 [00:00<00:05, 13.02it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.06it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.52it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.21it/s][A
  9%|▉         | 7/79 [00:00<00:10,  6.98it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.83it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.69it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.63it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.56it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.48it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.43it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.41it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.46it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.49it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.49it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.54it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.53it/s][A
 25%|██▌       | 20/79 [00:02<00:09,  6.54it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.53it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10677149891853333, 'eval_token_accuracy': 0.9578827287246179, 'eval_exact_match_accuracy': 0.6327, 'eval_runtime': 12.3976, 'eval_samples_per_second': 806.605, 'eval_steps_per_second': 6.372, 'epoch': 18.69}


 75%|███████▌  | 146500/195325 [3:13:57<5:55:44,  2.29it/s] 
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0777, 'grad_norm': 1.225794792175293, 'learning_rate': 1.4672962724279704e-05, 'epoch': 18.75}



  3%|▎         | 2/79 [00:00<00:05, 13.05it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.51it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.52it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.55it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.53it/s][A
 25%|██▌       | 20/79 [00:02<00:09,  6.55it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.53it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.11374931782484055, 'eval_token_accuracy': 0.9575998174766142, 'eval_exact_match_accuracy': 0.6273, 'eval_runtime': 12.364, 'eval_samples_per_second': 808.802, 'eval_steps_per_second': 6.39, 'epoch': 18.75}


 75%|███████▌  | 147000/195325 [3:17:58<5:50:50,  2.30it/s] 
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0773, 'grad_norm': 1.2209079265594482, 'learning_rate': 1.4389554409869932e-05, 'epoch': 18.81}



  3%|▎         | 2/79 [00:00<00:05, 13.04it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.51it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.51it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.56it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.57it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.55it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10531479120254517, 'eval_token_accuracy': 0.9586219484371435, 'eval_exact_match_accuracy': 0.6338, 'eval_runtime': 12.3829, 'eval_samples_per_second': 807.568, 'eval_steps_per_second': 6.38, 'epoch': 18.81}


 76%|███████▌  | 147500/195325 [3:21:58<5:47:36,  2.29it/s] 
  0%|          | 0/79 [00:00<?, ?it/s][A

{'loss': 0.0769, 'grad_norm': 1.5126792192459106, 'learning_rate': 1.4108449123128115e-05, 'epoch': 18.88}



  3%|▎         | 2/79 [00:00<00:05, 13.04it/s][A
  5%|▌         | 4/79 [00:00<00:09,  8.08it/s][A
  6%|▋         | 5/79 [00:00<00:09,  7.54it/s][A
  8%|▊         | 6/79 [00:00<00:10,  7.23it/s][A
  9%|▉         | 7/79 [00:00<00:10,  7.00it/s][A
 10%|█         | 8/79 [00:01<00:10,  6.85it/s][A
 11%|█▏        | 9/79 [00:01<00:10,  6.71it/s][A
 13%|█▎        | 10/79 [00:01<00:10,  6.65it/s][A
 14%|█▍        | 11/79 [00:01<00:10,  6.58it/s][A
 15%|█▌        | 12/79 [00:01<00:10,  6.50it/s][A
 16%|█▋        | 13/79 [00:01<00:10,  6.45it/s][A
 18%|█▊        | 14/79 [00:02<00:10,  6.44it/s][A
 19%|█▉        | 15/79 [00:02<00:09,  6.49it/s][A
 20%|██        | 16/79 [00:02<00:09,  6.51it/s][A
 22%|██▏       | 17/79 [00:02<00:09,  6.52it/s][A
 23%|██▎       | 18/79 [00:02<00:09,  6.57it/s][A
 24%|██▍       | 19/79 [00:02<00:09,  6.55it/s][A
 25%|██▌       | 20/79 [00:02<00:08,  6.56it/s][A
 27%|██▋       | 21/79 [00:03<00:08,  6.54it/s][A
 28%|██▊       | 22/79 [00:03<00:08, 

{'eval_loss': 0.10346191376447678, 'eval_token_accuracy': 0.9585854437599818, 'eval_exact_match_accuracy': 0.6333, 'eval_runtime': 12.3671, 'eval_samples_per_second': 808.596, 'eval_steps_per_second': 6.388, 'epoch': 18.88}


In [None]:
# Scratch check: build lists from two sets that hold the same nine letters,
# shuffle one of the lists, and compare the resulting element orders.
s = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'}
s2 = {'c', 'd', 'a', 'b', 'i', 'g', 'f', 'h', 'e'}
list1 = list(s)
list2 = list(s2)
# NOTE: the RNG is not seeded in this cell, so the shuffled order depends on
# whatever state the global `random` generator is in when the cell runs.
random.shuffle(list2)  # In-place shuffle; makes a different order very likely but does not guarantee it
print(list1)  # Elements of `s` in the order the set was iterated
print(list2)  # Elements of `s2` after shuffling

print(list1 == list2) # Order-sensitive comparison; list2 was shuffled, so this is almost always False

['h', 'a', 'd', 'e', 'f', 'i', 'g', 'b', 'c']
['g', 'e', 'f', 'b', 'd', 'c', 'i', 'h', 'a']
False


In [None]:
# Scratch check: do two equal sets, built from the same elements supplied in
# opposite order, iterate in the same order?
# Note: this cell rebinds `list1` / `list2`.
# NOTE(review): seeding `random` only affects the `random` module's generator.
# Nothing below draws from it, so the call does not influence set ordering;
# for str elements, order across sessions is governed by hash randomization
# (PYTHONHASHSEED) instead. The seed does remain in effect for later cells.
random.seed(42)  # Seeds the global RNG; does NOT make set iteration order deterministic
set1 = set(['a', 'b', 'c', 'd', 'e'])
set2 = set(['e', 'd', 'c', 'b', 'a'])

print("Set1:", set1)
print("Set2:", set2)

# Set equality compares membership only; iteration order is irrelevant here
print("Sets equal?", set1 == set2)

# Convert sets to lists; each list follows its set's iteration order, which is
# an implementation detail (presumably hash values plus insertion history when
# slots collide in CPython -- TODO confirm) and must not be relied upon
list1 = list(set1)
list2 = list(set2)

print("List1:", list1)
print("List2:", list2)

# List equality is order-sensitive, so this can be False even though the sets
# compare equal (the output recorded for this cell shows exactly that)
print("Lists equal (order and content)?", list1 == list2)


Set1: {'a', 'd', 'e', 'b', 'c'}
Set2: {'a', 'd', 'b', 'e', 'c'}
Sets equal? True
List1: ['a', 'd', 'e', 'b', 'c']
List2: ['a', 'd', 'b', 'e', 'c']
Lists equal (order and content)? False
