In [1]:
# %%
import pandas as pd
import networkx as nx
import logging
from random import shuffle
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import random
import string
from itertools import combinations
import pickle
import gc
import os
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer

#os.environ['WANDB_MODE'] = 'disabled'
# set current directory
os.chdir("/cs/student/projects1/aibh/2024/cbaumgar/MSC_THESIS")

# %%
import os
import shutil
import random
import string
import networkx as nx
from tqdm import tqdm
import numpy as np


  backends.update(_get_backends("networkx.backends"))
  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# --- 1. UTILITY FUNCTIONS ---

def generate_random_names(count):
    """Generates a list of unique random 2-letter names."""
    names = set()
    while len(names) < count:
        names.add(''.join(random.choices(string.ascii_lowercase, k=2)))
    return list(names)

def get_grid_graph(nodes, size=4):
    """
    Creates a grid graph of a given size with the provided node names.
    Default size is set to 4x4.
    """
    if len(nodes) != size * size:
        raise ValueError(f"Incorrect number of nodes. Expected {size*size}, got {len(nodes)}")
    random.shuffle(nodes)
    G = nx.DiGraph()
    for r in range(size):
        for c in range(size):
            idx = r * size + c
            u = nodes[idx]
            # Add edges with directions
            if c < size - 1: G.add_edge(u, nodes[idx + 1], direction='EAST')
            if c > 0: G.add_edge(u, nodes[idx - 1], direction='WEST')
            if r < size - 1: G.add_edge(u, nodes[idx + size], direction='SOUTH')
            if r > 0: G.add_edge(u, nodes[idx - size], direction='NORTH')
    return G

def generate_single_random_walk(G, min_walk_length, max_walk_length):
    """
    Generates one random walk on the graph with a length randomly chosen
    from the specified range [min_walk_length, max_walk_length].
    """
    nodes = list(G.nodes)
    random.shuffle(nodes)
    if not nodes:
        return []
    
    # *** KEY CHANGE: Choose a random length for this specific walk ***
    walk_length = random.randint(min_walk_length, max_walk_length)
        
    walk = []
    current_node = random.choice(nodes)
    walk.append(current_node)
    
    while len(walk) < walk_length:
        neighbors = list(G.successors(current_node))
        if not neighbors:
            break  # End walk if at a dead end
        next_node = random.choice(neighbors)
        walk.append(next_node)
        current_node = next_node
        
    return walk

def walk_to_string(walk, G):
    """Converts a list of nodes into a 'NODE DIR NODE...' string."""
    if not walk or len(walk) < 2:
        return "" # Return empty string for trivial walks
        
    path_segments = []
    for i in range(len(walk) - 1):
        node1 = walk[i]
        node2 = walk[i+1]
        try:
            direction = G.edges[node1, node2]['direction']
            path_segments.append(f"{node1} {direction}")
        except KeyError:
            print(f"Warning: Edge ({node1}, {node2}) missing from graph.")
            continue

    return " ".join(path_segments) + " " + walk[-1]

# --- 2. CORE DATA GENERATION LOGIC ---

def generate_and_save_passive_walks(num_walks, min_walk_length, max_walk_length, grid_size, output_file_path, description=""):
    """
    Generates and saves a dataset of random walks for passive learning.
    For each walk, a new graph with new random node names is created.
    """
    print(f"--- Generating {num_walks} walks for {description} ---")
    
    num_nodes = grid_size * grid_size
    
    with open(output_file_path, "w", encoding="utf-8") as f:
        for _ in tqdm(range(num_walks), desc=f"Generating {description}"):
            # 1. Create a new graph with fresh random names for each walk
            node_names = generate_random_names(num_nodes)
            G = get_grid_graph(node_names, size=grid_size)
            
            # 2. Generate a single random walk on this new graph with variable length
            walk_nodes = generate_single_random_walk(G, min_walk_length, max_walk_length)
            
            # 3. Convert the walk to the required string format
            walk_str = walk_to_string(walk_nodes, G)
            
            # 4. Write to file if it's a valid, non-empty walk
            if walk_str:
                f.write(walk_str + "\n")

    print(f"Successfully saved {num_walks} walks to {output_file_path}")


In [3]:

# # %%
# # --- 3. MAIN EXECUTION BLOCK ---

# if __name__ == "__main__":
#     # --- Configuration ---
#     NUM_TRAIN_WALKS = 1000000
#     NUM_TEST_WALKS = 10000
    
#     # *** KEY CHANGE: Define min/max walk lengths and set grid size to 4 ***
#     MIN_WALK_LENGTH = 120
#     MAX_WALK_LENGTH = 120
#     GRID_SIZE = 4 # Sticking to 4x4 grids as requested

#     # Set a seed for reproducibility
#     random.seed(42)
#     np.random.seed(42)

#     # --- Setup Output Directory ---
#     output_dir = "fv1_data" # More descriptive name
#     if os.path.exists(output_dir):
#         print(f"Removing existing directory: {output_dir}")
#         shutil.rmtree(output_dir)
#     print(f"Creating new directory: {output_dir}")
#     os.makedirs(output_dir)

#     # --- Generate Training Data ---
#     train_file_path = os.path.join(output_dir, "train.txt")
#     generate_and_save_passive_walks(
#         num_walks=NUM_TRAIN_WALKS,
#         min_walk_length=MIN_WALK_LENGTH,
#         max_walk_length=MAX_WALK_LENGTH,
#         grid_size=GRID_SIZE,
#         output_file_path=train_file_path,
#         description="Training Set"
#     )

#     # --- Generate Testing Data ---
#     test_file_path = os.path.join(output_dir, "test.txt")
#     generate_and_save_passive_walks(
#         num_walks=NUM_TEST_WALKS,
#         min_walk_length=MIN_WALK_LENGTH,
#         max_walk_length=MAX_WALK_LENGTH,
#         grid_size=GRID_SIZE,
#         output_file_path=test_file_path,
#         description="Testing Set"
#     )

#     print(f"\n--- Passive walk data generation finished successfully! ---")
#     print(f"Data saved in the '{output_dir}' directory.")



In [4]:
# import os

# # --- Configuration ---
# SOURCE_DIR = "fv1_data"
# TARGET_DIR = "fv1_data_with_eos"
# FILES_TO_PROCESS = ["train.txt", "test.txt"]
# EOS_TOKEN = "[EOS]"

# # --- Create the new directory ---
# os.makedirs(TARGET_DIR, exist_ok=True)
# print(f"Created directory: {TARGET_DIR}")

# for filename in FILES_TO_PROCESS:
#     source_path = os.path.join(SOURCE_DIR, filename)
#     target_path = os.path.join(TARGET_DIR, filename)
    
#     print(f"Processing {source_path} -> {target_path}...")
    
#     with open(source_path, 'r', encoding='utf-8') as f_in, \
#          open(target_path, 'w', encoding='utf-8') as f_out:
        
#         for line in f_in:
#             # Strip the original newline and append the EOS token and a new newline
#             stripped_line = line.strip()
#             if stripped_line: # Ensure we don't just write EOS for empty lines
#                 f_out.write(f"{stripped_line} {EOS_TOKEN}\n")

# print("\nProcessing complete.")
# print(f"New data files with EOS tokens are saved in '{TARGET_DIR}'.")

In [5]:
# %%
class GPT:

    def __init__(self, base_model=None, base_model_name='gpt2', vocab_size=100):
        self.base_model = base_model
        self.base_model_name = base_model_name
        self.vocab_size = vocab_size

        if self.base_model is not None:
            self.tokenizer = GPT2Tokenizer.from_pretrained(base_model)
            self.model = GPT2LMHeadModel.from_pretrained(base_model)
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def continue_input(self, input_sequence, max_new_tokens=5, num_return_sequences=1, no_repeat_ngram_size=0,
                       do_sample=False, temperature=0.7, num_beams=1):
        input_ids = self.tokenizer.encode(input_sequence, return_tensors='pt')

        # Generate text
        output = self.model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            num_return_sequences=num_return_sequences,
            num_beams=num_beams,
            no_repeat_ngram_size=no_repeat_ngram_size,
            do_sample=do_sample,
            temperature=temperature,
        )

        # Decode the output
        sequence = output[0].tolist()
        text = self.tokenizer.decode(sequence)
        return text

# %%
def load_pkl(pth):
    with open(pth, 'rb') as f:
        d = pickle.load(f)
    return d

def is_valid_path(sequence, graphs):
    # Split the sequence into parts
    parts = sequence.split()

    # Extract nodes and edges; nodes are at even indices, edges at odd indices
    nodes = parts[::2]
    edges = parts[1::2]

    # Convert edges to a lowercase version for comparison (assuming all edges in graphs are lowercase)
    edges = [edge.lower() for edge in edges]

    # Iterate over each graph to check if the path exists
    for graph in graphs:
        path_exists = True
        for i in range(len(nodes) - 1):
            # Check if the current graph has the edge between the current node and the next node
            if not graph.has_edge(nodes[i], nodes[i+1]):
                path_exists = False
                break

        # If path exists in the current graph, return True
        if path_exists:
            return True

    # If none of the graphs contain the path, return False
    return False


In [6]:
# from transformers import GPT2Tokenizer
# import os

# TOKENIZER_PATH = "foraging_tokenizer" # Assuming this path contains your tokenizer files

# print(f"Loading custom tokenizer from: {TOKENIZER_PATH}")
# my_custom_tokenizer = GPT2Tokenizer.from_pretrained(TOKENIZER_PATH)

# print("\n--- Verification ---")

# # Case 1: Word with a leading space (as if it were a normal word)
# text_with_space = "Go NORTH"
# encoded_with_space = my_custom_tokenizer.encode(text_with_space, add_special_tokens=False)
# decoded_with_space = my_custom_tokenizer.decode(encoded_with_space)
# tokens_with_space = [my_custom_tokenizer.decode([t]) for t in encoded_with_space]

# print(f"\nOriginal text: '{text_with_space}'")
# print(f"Encoded IDs: {encoded_with_space}")
# print(f"Decoded back: '{decoded_with_space}'")
# print(f"Individual tokens: {tokens_with_space}")


# # Case 2: Word without a leading space (start of sentence or after punctuation)
# text_no_space = "NORTH pole"
# encoded_no_space = my_custom_tokenizer.encode(text_no_space, add_special_tokens=False)
# decoded_no_space = my_custom_tokenizer.decode(encoded_no_space)
# tokens_no_space = [my_custom_tokenizer.decode([t]) for t in encoded_no_space]

# print(f"\nOriginal text: '{text_no_space}'")
# print(f"Encoded IDs: {encoded_no_space}")
# print(f"Decoded back: '{decoded_no_space}'")
# print(f"Individual tokens: {tokens_no_space}")

# # Case 3: A word that is part of a larger word
# text_part_of_word = "NORTHERN"
# encoded_part_of_word = my_custom_tokenizer.encode(text_part_of_word, add_special_tokens=False)
# decoded_part_of_word = my_custom_tokenizer.decode(encoded_part_of_word)
# tokens_part_of_word = [my_custom_tokenizer.decode([t]) for t in encoded_part_of_word]

# print(f"\nOriginal text: '{text_part_of_word}'")
# print(f"Encoded IDs: {encoded_part_of_word}")
# print(f"Decoded back: '{decoded_part_of_word}'")
# print(f"Individual tokens: {tokens_part_of_word}")


# # Inspecting the vocabulary mapping for "NORTH"
# print(f"\nID for 'NORTH': {my_custom_tokenizer.get_vocab().get('NORTH')}")
# print(f"ID for 'ĠNORTH': {my_custom_tokenizer.get_vocab().get('ĠNORTH')}") # Should be None or a different ID if 'ĠNORTH' was in vocab

In [7]:

# # %%
# from transformers import AutoConfig, GPT2Tokenizer
# import os

# # --- Configuration ---
# TOKENIZER_PATH = "foraging_tokenizer"
# CONFIG_SAVE_PATH = "foraging_chunked_config"

# # 1. Load the custom tokenizer to get its properties
# print(f"Loading custom tokenizer from: {TOKENIZER_PATH}")
# my_custom_tokenizer = GPT2Tokenizer.from_pretrained(TOKENIZER_PATH)

# # 2. Load the base gpt2 config as a blueprint
# print("Loading base gpt2 config...")
# config = AutoConfig.from_pretrained('gpt2')

# # 3. CRITICAL: Override the config with our custom tokenizer's properties
# # --- IMPROVEMENT: Use len() for the most reliable vocab size ---
# print(f"Overriding config vocab_size. Old: {config.vocab_size}, New: {len(my_custom_tokenizer)}")
# config.vocab_size = len(my_custom_tokenizer)

# # Also update the special token IDs
# print("Updating special token IDs in the config...")
# config.bos_token_id = my_custom_tokenizer.bos_token_id
# config.eos_token_id = my_custom_tokenizer.eos_token_id

# # --- IMPROVEMENT: Explicitly set the pad_token_id ---
# # This ensures full compatibility with the Trainer and DataCollator.
# config.pad_token_id = my_custom_tokenizer.pad_token_id
# print(f"Set pad_token_id to: {config.pad_token_id}")
# # ----------------------------------------------------

# # 4. Save the new, modified config to its own directory
# os.makedirs(CONFIG_SAVE_PATH, exist_ok=True)
# config.save_pretrained(CONFIG_SAVE_PATH)

# print(f"\nCustom model config saved successfully to: {CONFIG_SAVE_PATH}")


In [12]:
from transformers import GPT2Tokenizer

# --- Load your custom tokenizer ---
TOKENIZER_PATH = "foraging_tokenizer"
tokenizer = GPT2Tokenizer.from_pretrained(TOKENIZER_PATH)

# --- Test input sequence ---
test_text = "eb EAST lt NORTH yz NORTH mb NORTH jf SOUTH mb WEST rr NORTH he SOUTH rr SOUTH no NORTH rr [START_NODE] jf [GOAL] eb [PLAN] jf SOUTH mb SOUTH yz SOUTH lt WEST eb [EOS]"

# --- Encode the sequence ---
encoded = tokenizer.encode(test_text, add_special_tokens=True)
print("Encoded IDs:", encoded)

# --- Decode back to text ---
decoded = tokenizer.decode(encoded)
print("Decoded text:", decoded)

# --- Extra: view tokens step by step ---
tokens = tokenizer.convert_ids_to_tokens(encoded)
print("Tokenized sequence:", tokens)


Encoded IDs: [1638, 282, 420, 276, 484, 276, 488, 276, 768, 272, 488, 279, 849, 276, 637, 272, 849, 272, 470, 276, 849, 231, 8, 768, 231, 9, 497, 231, 5, 768, 272, 488, 272, 484, 272, 420, 279, 497, 231, 2]
Decoded text: eb EAST lt NORTH yz NORTH mb NORTH jf SOUTH mb WEST rr NORTH he SOUTH rr SOUTH no NORTH rr [START_NODE] jf [GOAL] eb [PLAN] jf SOUTH mb SOUTH yz SOUTH lt WEST eb [EOS]
Tokenized sequence: ['eb', 'ĠEAST', 'Ġlt', 'ĠNORTH', 'Ġyz', 'ĠNORTH', 'Ġmb', 'ĠNORTH', 'Ġjf', 'ĠSOUTH', 'Ġmb', 'ĠWEST', 'Ġrr', 'ĠNORTH', 'Ġhe', 'ĠSOUTH', 'Ġrr', 'ĠSOUTH', 'Ġno', 'ĠNORTH', 'Ġrr', 'Ġ', '[START_NODE]', 'Ġjf', 'Ġ', '[GOAL]', 'Ġeb', 'Ġ', '[PLAN]', 'Ġjf', 'ĠSOUTH', 'Ġmb', 'ĠSOUTH', 'Ġyz', 'ĠSOUTH', 'Ġlt', 'ĠWEST', 'Ġeb', 'Ġ', '[EOS]']


In [8]:

# %%
import os

# Define the path for new cache directory 
project_cache_path = os.path.join(os.getcwd(), '.cache', 'huggingface_datasets')
os.makedirs(project_cache_path, exist_ok=True)

# Set the environment variable to tell the 'datasets' library to use this new path
os.environ['HF_DATASETS_CACHE'] = project_cache_path

print(f"Hugging Face datasets cache is now set to: {os.environ['HF_DATASETS_CACHE']}")

wandb_path = os.path.join(os.getcwd(), 'wandb_local_runs')
os.makedirs(wandb_path, exist_ok=True)
# Set the environment variable to tell 'wandb' to use this new path
os.environ['WANDB_DIR'] = wandb_path
print(f"Wandb local run directory set to: {os.environ['WANDB_DIR']}")

import os
os.environ["WANDB__SERVICE_WAIT"] = "120"


Hugging Face datasets cache is now set to: /cs/student/projects1/aibh/2024/cbaumgar/MSC_THESIS/.cache/huggingface_datasets
Wandb local run directory set to: /cs/student/projects1/aibh/2024/cbaumgar/MSC_THESIS/wandb_local_runs


In [9]:

# %%
def train_model_script(num_epochs=1,
                       output_dir='fv1_model',
                       lr=5e-05, wandb_run_id=None):
    gc.collect()
    try:
        wandb.login()
    except Exception as e:
        print(f"Could not log in to wandb: {e}")

    if wandb_run_id:
        print(f"Attempting to resume W&B run with ID: {wandb_run_id}")
        os.environ['WANDB_RESUME'] = "allow"
        os.environ['WANDB_RUN_ID'] = wandb_run_id

    os.environ['WANDB_PROJECT'] = 'foraging_models'

    # --- Hyperparameters ---
    save_steps = 500
    save_total_limit = 20
    weight_decay = 0.1
    beta2 = 0.98
    

    # --- Define Clean Paths ---
    data_dir = 'fv1_data'
    tokenizer_dir = 'foraging_tokenizer'
    config_dir = 'foraging_chunked_config'

    train_path = f'./{data_dir}/train.txt'
    test_path = f'./{data_dir}/test.txt'
    
    training_script_path = './run_clm.py'

    command = f"""
    python3 {training_script_path} \\
        --config_name {config_dir} \\
        --tokenizer_name {tokenizer_dir} \\
        --train_file {train_path} \\
        --validation_file {test_path} \\
        --do_train \\
        --do_eval \\
        --output_dir {output_dir} \\
        --per_device_train_batch_size 16 \\
        --per_device_eval_batch_size 16 \\
        --num_train_epochs {num_epochs} \\
        --save_strategy 'steps' \\
        --save_steps {save_steps} \\
        --save_total_limit {save_total_limit} \\
        --eval_strategy 'steps' \\
        --eval_steps 500 \\
        --warmup_steps 0 \\
        --learning_rate {lr} \\
        --report_to 'wandb' \\
        --lr_scheduler_type cosine \\
        --weight_decay {weight_decay} \\
        --adam_beta2 {beta2} \\
        --dataloader_num_workers 0 \\
        --overwrite_cache \
    """

    print("--- Running Command ---")
    print(command)
    print("-----------------------")

    os.system(command)
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"



In [10]:
#!rm -rf fv1_model/
!mkdir fv1_model_chunked/

mkdir: cannot create directory ‘fv1_model_chunked/’: File exists


In [11]:
train_model_script(num_epochs=8,
                   output_dir='fv1_model_chunked',
                   lr=1e-04, wandb_run_id="ci75uvds")  # Replace None with actual run ID to resume


Could not log in to wandb: name 'wandb' is not defined
Attempting to resume W&B run with ID: ci75uvds
--- Running Command ---

    python3 ./run_clm.py \
        --config_name foraging_chunked_config \
        --tokenizer_name foraging_tokenizer \
        --train_file ./fv1_data/train.txt \
        --validation_file ./fv1_data/test.txt \
        --do_train \
        --do_eval \
        --output_dir fv1_model_chunked \
        --per_device_train_batch_size 16 \
        --per_device_eval_batch_size 16 \
        --num_train_epochs 8 \
        --save_strategy 'steps' \
        --save_steps 500 \
        --save_total_limit 20 \
        --eval_strategy 'steps' \
        --eval_steps 500 \
        --warmup_steps 0 \
        --learning_rate 0.0001 \
        --report_to 'wandb' \
        --lr_scheduler_type cosine \
        --weight_decay 0.1 \
        --adam_beta2 0.98 \
        --dataloader_num_workers 0 \
        --overwrite_cache     
-----------------------
09/05/2025 03:38:11 - INFO - __m

Using custom data configuration default-d05b448f19c4b626
Found cached dataset text (/cs/student/projects1/aibh/2024/cbaumgar/MSC_THESIS/.cache/huggingface_datasets/text/default-d05b448f19c4b626/0.0.0/37eaf37ac90527a7fd768c94b312ee84f8815c9b7ac00acf81c1c364e8392f99)
[INFO|configuration_utils.py:763] 2025-09-05 03:38:12,280 >> loading configuration file foraging_chunked_config/config.json
[INFO|configuration_utils.py:839] 2025-09-05 03:38:12,281 >> Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 3,
  "embd_pdrop": 0.1,
  "eos_token_id": 2,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 0,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summa

09/05/2025 03:38:12 - INFO - datasets.builder - Using custom data configuration default-d05b448f19c4b626
09/05/2025 03:38:12 - INFO - datasets.builder - Found cached dataset text (/cs/student/projects1/aibh/2024/cbaumgar/MSC_THESIS/.cache/huggingface_datasets/text/default-d05b448f19c4b626/0.0.0/37eaf37ac90527a7fd768c94b312ee84f8815c9b7ac00acf81c1c364e8392f99)
09/05/2025 03:38:12 - INFO - __main__ - Training new model from scratch - Total size=83.08M params
09/05/2025 03:38:12 - INFO - datasets.arrow_dataset - Caching processed dataset at /cs/student/projects1/aibh/2024/cbaumgar/MSC_THESIS/.cache/huggingface_datasets/text/default-d05b448f19c4b626/0.0.0/37eaf37ac90527a7fd768c94b312ee84f8815c9b7ac00acf81c1c364e8392f99/cache-0a0d076d97ff9d63.arrow


Running tokenizer on dataset: 100%|██████████| 1000000/1000000 [00:00<?, ? examples/s]Caching processed dataset at /cs/student/projects1/aibh/2024/cbaumgar/MSC_THESIS/.cache/huggingface_datasets/text/default-d05b448f19c4b626/0.0.0/37eaf37ac90527a7fd768c94b312ee84f8815c9b7ac00acf81c1c364e8392f99/cache-0a0d076d97ff9d63.arrow
Running tokenizer on dataset: 2000000 examples [01:21, 12298.53 examples/s]           
Running tokenizer on dataset: 100%|██████████| 10000/10000 [00:00<?, ? examples/s]Caching processed dataset at /cs/student/projects1/aibh/2024/cbaumgar/MSC_THESIS/.cache/huggingface_datasets/text/default-d05b448f19c4b626/0.0.0/37eaf37ac90527a7fd768c94b312ee84f8815c9b7ac00acf81c1c364e8392f99/cache-51fd396e9f0dc66b.arrow
Running tokenizer on dataset: 12000 examples [00:00, 12873.03 examples/s]         

09/05/2025 03:39:34 - INFO - datasets.arrow_dataset - Caching processed dataset at /cs/student/projects1/aibh/2024/cbaumgar/MSC_THESIS/.cache/huggingface_datasets/text/default-d05b448f19c4b626/0.0.0/37eaf37ac90527a7fd768c94b312ee84f8815c9b7ac00acf81c1c364e8392f99/cache-51fd396e9f0dc66b.arrow


Running tokenizer on dataset: 20000 examples [00:00, 12571.42 examples/s]
Grouping texts in chunks of 1024: 100%|██████████| 1000000/1000000 [00:00<?, ? examples/s]Caching processed dataset at /cs/student/projects1/aibh/2024/cbaumgar/MSC_THESIS/.cache/huggingface_datasets/text/default-d05b448f19c4b626/0.0.0/37eaf37ac90527a7fd768c94b312ee84f8815c9b7ac00acf81c1c364e8392f99/cache-7d1999f99b9e3243.arrow
Grouping texts in chunks of 1024: 1002000 examples [00:00, 7908.95 examples/s]            

09/05/2025 03:39:35 - INFO - datasets.arrow_dataset - Caching processed dataset at /cs/student/projects1/aibh/2024/cbaumgar/MSC_THESIS/.cache/huggingface_datasets/text/default-d05b448f19c4b626/0.0.0/37eaf37ac90527a7fd768c94b312ee84f8815c9b7ac00acf81c1c364e8392f99/cache-7d1999f99b9e3243.arrow


Grouping texts in chunks of 1024: 2000000 examples [02:40, 6224.19 examples/s]
Grouping texts in chunks of 1024: 100%|██████████| 10000/10000 [00:00<?, ? examples/s]Caching processed dataset at /cs/student/projects1/aibh/2024/cbaumgar/MSC_THESIS/.cache/huggingface_datasets/text/default-d05b448f19c4b626/0.0.0/37eaf37ac90527a7fd768c94b312ee84f8815c9b7ac00acf81c1c364e8392f99/cache-3ed4bd1e58a51233.arrow
Grouping texts in chunks of 1024: 11000 examples [00:00, 6996.55 examples/s]          

09/05/2025 03:42:15 - INFO - datasets.arrow_dataset - Caching processed dataset at /cs/student/projects1/aibh/2024/cbaumgar/MSC_THESIS/.cache/huggingface_datasets/text/default-d05b448f19c4b626/0.0.0/37eaf37ac90527a7fd768c94b312ee84f8815c9b7ac00acf81c1c364e8392f99/cache-3ed4bd1e58a51233.arrow


Grouping texts in chunks of 1024: 20000 examples [00:01, 5338.73 examples/s]
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[INFO|trainer.py:2940] 2025-09-05 03:42:19,552 >> Loading model from fv1_model_chunked/checkpoint-83500.
[INFO|trainer.py:2523] 2025-09-05 03:42:35,662 >> ***** Running training *****
[INFO|trainer.py:2524] 2025-09-05 03:42:35,662 >>   Num examples = 234,000
[INFO|trainer.py:2525] 2025-09-05 03:42:35,662 >>   Num Epochs = 8
[INFO|trai

{'loss': 1.0602, 'grad_norm': 0.19294366240501404, 'learning_rate': 1.8378771186014203e-05, 'epoch': 5.74}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.27it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.69it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.70it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:30,  4.38it/s][A
 10%|█         | 15/147 [00:03<00:29,  4.46it/s][A
 11%|█         | 16/147 [00:03<00:29,  4.51it/s][A
 12%|█▏        | 17/147 [00:03<00:28,  4.55it/s][A
 12%|█▏        | 18/147 [00:03<00:28,  4.58it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.60it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0607483386993408, 'eval_accuracy': 0.5937242566274824, 'eval_runtime': 35.4266, 'eval_samples_per_second': 66.052, 'eval_steps_per_second': 4.149, 'epoch': 5.74}


[INFO|modeling_utils.py:4297] 2025-09-05 03:52:08,935 >> Model weights saved in fv1_model_chunked/checkpoint-84000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 03:52:08,940 >> tokenizer config file saved in fv1_model_chunked/checkpoint-84000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 03:52:08,942 >> Special tokens file saved in fv1_model_chunked/checkpoint-84000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 03:52:15,470 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-74500] due to args.save_total_limit
 72%|███████▏  | 84500/117000 [16:54<7:38:38,  1.18it/s][INFO|trainer.py:4623] 2025-09-05 03:59:31,975 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 03:59:31,975 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 03:59:31,975 >>   Batch size = 16


{'loss': 1.0604, 'grad_norm': 0.1911117136478424, 'learning_rate': 1.7861647988624286e-05, 'epoch': 5.78}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.90it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.66it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0607481002807617, 'eval_accuracy': 0.5936966856321695, 'eval_runtime': 35.0257, 'eval_samples_per_second': 66.808, 'eval_steps_per_second': 4.197, 'epoch': 5.78}


[INFO|modeling_utils.py:4297] 2025-09-05 04:00:10,071 >> Model weights saved in fv1_model_chunked/checkpoint-84500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 04:00:10,076 >> tokenizer config file saved in fv1_model_chunked/checkpoint-84500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 04:00:10,078 >> Special tokens file saved in fv1_model_chunked/checkpoint-84500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 04:00:16,621 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-75000] due to args.save_total_limit
 73%|███████▎  | 85000/117000 [24:13<6:46:05,  1.31it/s][INFO|trainer.py:4623] 2025-09-05 04:06:51,035 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 04:06:51,035 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 04:06:51,036 >>   Batch size = 16


{'loss': 1.0604, 'grad_norm': 0.2152513563632965, 'learning_rate': 1.7350317545563793e-05, 'epoch': 5.81}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.69it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.71it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.69it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0606658458709717, 'eval_accuracy': 0.5937710437710437, 'eval_runtime': 34.9981, 'eval_samples_per_second': 66.861, 'eval_steps_per_second': 4.2, 'epoch': 5.81}


[INFO|modeling_utils.py:4297] 2025-09-05 04:07:29,106 >> Model weights saved in fv1_model_chunked/checkpoint-85000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 04:07:29,111 >> tokenizer config file saved in fv1_model_chunked/checkpoint-85000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 04:07:29,114 >> Special tokens file saved in fv1_model_chunked/checkpoint-85000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 04:07:35,437 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-75500] due to args.save_total_limit
 73%|███████▎  | 85500/117000 [31:11<6:23:33,  1.37it/s][INFO|trainer.py:4623] 2025-09-05 04:13:49,122 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 04:13:49,122 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 04:13:49,122 >>   Batch size = 16


{'loss': 1.0603, 'grad_norm': 0.26194801926612854, 'learning_rate': 1.6844872021223636e-05, 'epoch': 5.85}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.76it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0607097148895264, 'eval_accuracy': 0.5936365307333049, 'eval_runtime': 35.2306, 'eval_samples_per_second': 66.419, 'eval_steps_per_second': 4.173, 'epoch': 5.85}


[INFO|modeling_utils.py:4297] 2025-09-05 04:14:27,429 >> Model weights saved in fv1_model_chunked/checkpoint-85500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 04:14:27,432 >> tokenizer config file saved in fv1_model_chunked/checkpoint-85500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 04:14:27,434 >> Special tokens file saved in fv1_model_chunked/checkpoint-85500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 04:14:33,981 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-76000] due to args.save_total_limit
 74%|███████▎  | 86000/117000 [37:59<6:06:15,  1.41it/s][INFO|trainer.py:4623] 2025-09-05 04:20:36,910 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 04:20:36,910 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 04:20:36,910 >>   Batch size = 16


{'loss': 1.0601, 'grad_norm': 0.1722496896982193, 'learning_rate': 1.6345402519271773e-05, 'epoch': 5.88}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.66it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0606669187545776, 'eval_accuracy': 0.5937618534392728, 'eval_runtime': 34.962, 'eval_samples_per_second': 66.93, 'eval_steps_per_second': 4.205, 'epoch': 5.88}


[INFO|modeling_utils.py:4297] 2025-09-05 04:21:14,942 >> Model weights saved in fv1_model_chunked/checkpoint-86000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 04:21:14,947 >> tokenizer config file saved in fv1_model_chunked/checkpoint-86000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 04:21:14,950 >> Special tokens file saved in fv1_model_chunked/checkpoint-86000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 04:21:21,205 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-76500] due to args.save_total_limit
 74%|███████▍  | 86500/117000 [44:40<6:15:05,  1.36it/s][INFO|trainer.py:4623] 2025-09-05 04:27:17,745 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 04:27:17,745 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 04:27:17,745 >>   Batch size = 16


{'loss': 1.0607, 'grad_norm': 0.29304489493370056, 'learning_rate': 1.585199906623221e-05, 'epoch': 5.91}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0606942176818848, 'eval_accuracy': 0.5940375633924021, 'eval_runtime': 34.9589, 'eval_samples_per_second': 66.936, 'eval_steps_per_second': 4.205, 'epoch': 5.91}


[INFO|modeling_utils.py:4297] 2025-09-05 04:27:55,783 >> Model weights saved in fv1_model_chunked/checkpoint-86500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 04:27:55,787 >> tokenizer config file saved in fv1_model_chunked/checkpoint-86500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 04:27:55,789 >> Special tokens file saved in fv1_model_chunked/checkpoint-86500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 04:28:02,318 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-77000] due to args.save_total_limit
 74%|███████▍  | 87000/117000 [51:17<5:56:46,  1.40it/s][INFO|trainer.py:4623] 2025-09-05 04:33:55,592 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 04:33:55,592 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 04:33:55,592 >>   Batch size = 16


{'loss': 1.0604, 'grad_norm': 0.29293352365493774, 'learning_rate': 1.536475059525834e-05, 'epoch': 5.95}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.74it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.71it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0606369972229004, 'eval_accuracy': 0.5936336065368324, 'eval_runtime': 34.9388, 'eval_samples_per_second': 66.974, 'eval_steps_per_second': 4.207, 'epoch': 5.95}


[INFO|modeling_utils.py:4297] 2025-09-05 04:34:33,599 >> Model weights saved in fv1_model_chunked/checkpoint-87000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 04:34:33,605 >> tokenizer config file saved in fv1_model_chunked/checkpoint-87000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 04:34:33,608 >> Special tokens file saved in fv1_model_chunked/checkpoint-87000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 04:34:40,124 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-77500] due to args.save_total_limit
 75%|███████▍  | 87500/117000 [57:53<5:46:06,  1.42it/s][INFO|trainer.py:4623] 2025-09-05 04:40:30,938 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 04:40:30,938 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 04:40:30,938 >>   Batch size = 16


{'loss': 1.0596, 'grad_norm': 0.2521927058696747, 'learning_rate': 1.4883744930103194e-05, 'epoch': 5.98}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.06it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.92it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.71it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.66it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.66it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0606114864349365, 'eval_accuracy': 0.5937146485533582, 'eval_runtime': 34.9233, 'eval_samples_per_second': 67.004, 'eval_steps_per_second': 4.209, 'epoch': 5.98}


[INFO|modeling_utils.py:4297] 2025-09-05 04:41:08,939 >> Model weights saved in fv1_model_chunked/checkpoint-87500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 04:41:08,943 >> tokenizer config file saved in fv1_model_chunked/checkpoint-87500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 04:41:08,945 >> Special tokens file saved in fv1_model_chunked/checkpoint-87500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 04:41:15,492 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-78000] due to args.save_total_limit
 75%|███████▌  | 88000/117000 [1:04:24<5:33:39,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 04:47:02,632 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 04:47:02,632 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 04:47:02,632 >>   Batch size = 16


{'loss': 1.0602, 'grad_norm': 0.17377632856369019, 'learning_rate': 1.4409068769289697e-05, 'epoch': 6.02}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0606578588485718, 'eval_accuracy': 0.5934819660626113, 'eval_runtime': 34.9692, 'eval_samples_per_second': 66.916, 'eval_steps_per_second': 4.204, 'epoch': 6.02}


[INFO|modeling_utils.py:4297] 2025-09-05 04:47:40,679 >> Model weights saved in fv1_model_chunked/checkpoint-88000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 04:47:40,686 >> tokenizer config file saved in fv1_model_chunked/checkpoint-88000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 04:47:40,688 >> Special tokens file saved in fv1_model_chunked/checkpoint-88000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 04:47:47,246 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-78500] due to args.save_total_limit
 76%|███████▌  | 88500/117000 [1:10:56<5:28:05,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 04:53:33,803 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 04:53:33,804 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 04:53:33,804 >>   Batch size = 16


{'loss': 1.0602, 'grad_norm': 0.24015939235687256, 'learning_rate': 1.3940807670483813e-05, 'epoch': 6.05}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.26it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.64it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0606483221054077, 'eval_accuracy': 0.5935717806685549, 'eval_runtime': 34.9767, 'eval_samples_per_second': 66.902, 'eval_steps_per_second': 4.203, 'epoch': 6.05}


[INFO|modeling_utils.py:4297] 2025-09-05 04:54:11,894 >> Model weights saved in fv1_model_chunked/checkpoint-88500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 04:54:11,897 >> tokenizer config file saved in fv1_model_chunked/checkpoint-88500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 04:54:11,899 >> Special tokens file saved in fv1_model_chunked/checkpoint-88500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 04:54:18,055 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-79000] due to args.save_total_limit
 76%|███████▌  | 89000/117000 [1:17:25<5:23:54,  1.44it/s][INFO|trainer.py:4623] 2025-09-05 05:00:03,206 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 05:00:03,206 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 05:00:03,207 >>   Batch size = 16


{'loss': 1.0603, 'grad_norm': 0.260516494512558, 'learning_rate': 1.3479046035073239e-05, 'epoch': 6.09}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.69it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.66it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0606478452682495, 'eval_accuracy': 0.5937321937321938, 'eval_runtime': 34.9814, 'eval_samples_per_second': 66.893, 'eval_steps_per_second': 4.202, 'epoch': 6.09}


[INFO|modeling_utils.py:4297] 2025-09-05 05:00:41,260 >> Model weights saved in fv1_model_chunked/checkpoint-89000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 05:00:41,264 >> tokenizer config file saved in fv1_model_chunked/checkpoint-89000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 05:00:41,266 >> Special tokens file saved in fv1_model_chunked/checkpoint-89000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 05:00:47,851 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-79500] due to args.save_total_limit
 76%|███████▋  | 89500/117000 [1:23:55<5:14:44,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 05:06:32,752 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 05:06:32,752 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 05:06:32,752 >>   Batch size = 16


{'loss': 1.0602, 'grad_norm': 0.3088438808917999, 'learning_rate': 1.3023867092954462e-05, 'epoch': 6.12}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.060655951499939, 'eval_accuracy': 0.5935914145591565, 'eval_runtime': 34.9655, 'eval_samples_per_second': 66.923, 'eval_steps_per_second': 4.204, 'epoch': 6.12}


[INFO|modeling_utils.py:4297] 2025-09-05 05:07:10,785 >> Model weights saved in fv1_model_chunked/checkpoint-89500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 05:07:10,789 >> tokenizer config file saved in fv1_model_chunked/checkpoint-89500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 05:07:10,791 >> Special tokens file saved in fv1_model_chunked/checkpoint-89500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 05:07:17,413 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-80000] due to args.save_total_limit
 77%|███████▋  | 90000/117000 [1:30:24<5:09:11,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 05:13:01,836 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 05:13:01,836 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 05:13:01,836 >>   Batch size = 16


{'loss': 1.0601, 'grad_norm': 0.2578507959842682, 'learning_rate': 1.25753528875312e-05, 'epoch': 6.15}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.060704231262207, 'eval_accuracy': 0.5936423791262501, 'eval_runtime': 35.1361, 'eval_samples_per_second': 66.598, 'eval_steps_per_second': 4.184, 'epoch': 6.15}


[INFO|modeling_utils.py:4297] 2025-09-05 05:13:40,042 >> Model weights saved in fv1_model_chunked/checkpoint-90000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 05:13:40,048 >> tokenizer config file saved in fv1_model_chunked/checkpoint-90000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 05:13:40,051 >> Special tokens file saved in fv1_model_chunked/checkpoint-90000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 05:13:46,167 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-80500] due to args.save_total_limit
 77%|███████▋  | 90500/117000 [1:36:52<5:03:27,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 05:19:30,183 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 05:19:30,183 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 05:19:30,183 >>   Batch size = 16


{'loss': 1.0603, 'grad_norm': 0.18125085532665253, 'learning_rate': 1.2133584260926422e-05, 'epoch': 6.19}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.28it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.90it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.64it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.64it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.060597538948059, 'eval_accuracy': 0.5935233225555806, 'eval_runtime': 34.9751, 'eval_samples_per_second': 66.905, 'eval_steps_per_second': 4.203, 'epoch': 6.19}


[INFO|modeling_utils.py:4297] 2025-09-05 05:20:08,244 >> Model weights saved in fv1_model_chunked/checkpoint-90500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 05:20:08,253 >> tokenizer config file saved in fv1_model_chunked/checkpoint-90500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 05:20:08,255 >> Special tokens file saved in fv1_model_chunked/checkpoint-90500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 05:20:14,396 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-81000] due to args.save_total_limit
 78%|███████▊  | 91000/117000 [1:43:20<4:57:39,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 05:25:58,391 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 05:25:58,391 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 05:25:58,391 >>   Batch size = 16


{'loss': 1.06, 'grad_norm': 0.1741383969783783, 'learning_rate': 1.169864083941103e-05, 'epoch': 6.22}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.28it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.67it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.26it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.90it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:29,  4.76it/s][A
  7%|▋         | 10/147 [00:01<00:29,  4.72it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.64it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.64it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0606398582458496, 'eval_accuracy': 0.5935763758344403, 'eval_runtime': 34.9758, 'eval_samples_per_second': 66.903, 'eval_steps_per_second': 4.203, 'epoch': 6.22}


[INFO|modeling_utils.py:4297] 2025-09-05 05:26:36,452 >> Model weights saved in fv1_model_chunked/checkpoint-91000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 05:26:36,457 >> tokenizer config file saved in fv1_model_chunked/checkpoint-91000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 05:26:36,459 >> Special tokens file saved in fv1_model_chunked/checkpoint-91000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 05:26:42,789 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-81500] due to args.save_total_limit
 78%|███████▊  | 91500/117000 [1:49:48<4:51:50,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 05:32:26,497 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 05:32:26,497 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 05:32:26,497 >>   Batch size = 16


{'loss': 1.0601, 'grad_norm': 0.17041286826133728, 'learning_rate': 1.127060101905174e-05, 'epoch': 6.26}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.31it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.060584545135498, 'eval_accuracy': 0.5937267630816018, 'eval_runtime': 34.924, 'eval_samples_per_second': 67.003, 'eval_steps_per_second': 4.209, 'epoch': 6.26}


[INFO|modeling_utils.py:4297] 2025-09-05 05:33:04,492 >> Model weights saved in fv1_model_chunked/checkpoint-91500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 05:33:04,497 >> tokenizer config file saved in fv1_model_chunked/checkpoint-91500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 05:33:04,498 >> Special tokens file saved in fv1_model_chunked/checkpoint-91500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 05:33:11,047 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-82000] due to args.save_total_limit
 79%|███████▊  | 92000/117000 [1:56:17<4:46:18,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 05:38:54,772 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 05:38:54,772 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 05:38:54,772 >>   Batch size = 16


{'loss': 1.06, 'grad_norm': 0.41466233134269714, 'learning_rate': 1.0849541951580605e-05, 'epoch': 6.29}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.060727596282959, 'eval_accuracy': 0.5934360144037564, 'eval_runtime': 34.9742, 'eval_samples_per_second': 66.907, 'eval_steps_per_second': 4.203, 'epoch': 6.29}


[INFO|modeling_utils.py:4297] 2025-09-05 05:39:32,827 >> Model weights saved in fv1_model_chunked/checkpoint-92000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 05:39:32,831 >> tokenizer config file saved in fv1_model_chunked/checkpoint-92000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 05:39:32,833 >> Special tokens file saved in fv1_model_chunked/checkpoint-92000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 05:39:39,230 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-82500] due to args.save_total_limit
 79%|███████▉  | 92500/117000 [2:02:45<4:40:41,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 05:45:23,014 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 05:45:23,014 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 05:45:23,014 >>   Batch size = 16


{'loss': 1.0601, 'grad_norm': 0.2379734218120575, 'learning_rate': 1.0435539530488814e-05, 'epoch': 6.32}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.57it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.69it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.66it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.060598373413086, 'eval_accuracy': 0.5939552681488165, 'eval_runtime': 34.9342, 'eval_samples_per_second': 66.983, 'eval_steps_per_second': 4.208, 'epoch': 6.32}


[INFO|modeling_utils.py:4297] 2025-09-05 05:46:01,017 >> Model weights saved in fv1_model_chunked/checkpoint-92500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 05:46:01,022 >> tokenizer config file saved in fv1_model_chunked/checkpoint-92500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 05:46:01,024 >> Special tokens file saved in fv1_model_chunked/checkpoint-92500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 05:46:07,287 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-83000] due to args.save_total_limit
 79%|███████▉  | 93000/117000 [2:09:13<4:34:57,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 05:51:51,084 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 05:51:51,084 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 05:51:51,084 >>   Batch size = 16


{'loss': 1.06, 'grad_norm': 0.14918920397758484, 'learning_rate': 1.002866837734739e-05, 'epoch': 6.36}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.69it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.66it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.060657262802124, 'eval_accuracy': 0.5935613371097243, 'eval_runtime': 34.9418, 'eval_samples_per_second': 66.968, 'eval_steps_per_second': 4.207, 'epoch': 6.36}


[INFO|modeling_utils.py:4297] 2025-09-05 05:52:29,105 >> Model weights saved in fv1_model_chunked/checkpoint-93000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 05:52:29,107 >> tokenizer config file saved in fv1_model_chunked/checkpoint-93000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 05:52:29,109 >> Special tokens file saved in fv1_model_chunked/checkpoint-93000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 05:52:35,604 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-83500] due to args.save_total_limit
 80%|███████▉  | 93500/117000 [2:15:41<4:29:11,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 05:58:19,639 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 05:58:19,639 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 05:58:19,639 >>   Batch size = 16


{'loss': 1.0599, 'grad_norm': 0.24117451906204224, 'learning_rate': 9.629001828357087e-06, 'epoch': 6.39}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.76it/s][A
  7%|▋         | 10/147 [00:01<00:29,  4.72it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0606305599212646, 'eval_accuracy': 0.593793601858118, 'eval_runtime': 34.9808, 'eval_samples_per_second': 66.894, 'eval_steps_per_second': 4.202, 'epoch': 6.39}


[INFO|modeling_utils.py:4297] 2025-09-05 05:58:57,691 >> Model weights saved in fv1_model_chunked/checkpoint-93500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 05:58:57,696 >> tokenizer config file saved in fv1_model_chunked/checkpoint-93500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 05:58:57,701 >> Special tokens file saved in fv1_model_chunked/checkpoint-93500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 05:59:03,848 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-73125] due to args.save_total_limit
 80%|████████  | 94000/117000 [2:22:09<4:23:25,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 06:04:47,575 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 06:04:47,575 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 06:04:47,575 >>   Batch size = 16


{'loss': 1.0602, 'grad_norm': 0.19956018030643463, 'learning_rate': 9.236611921129857e-06, 'epoch': 6.43}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.71it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0606213808059692, 'eval_accuracy': 0.5936549113968469, 'eval_runtime': 34.941, 'eval_samples_per_second': 66.97, 'eval_steps_per_second': 4.207, 'epoch': 6.43}


[INFO|modeling_utils.py:4297] 2025-09-05 06:05:25,587 >> Model weights saved in fv1_model_chunked/checkpoint-94000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 06:05:25,592 >> tokenizer config file saved in fv1_model_chunked/checkpoint-94000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 06:05:25,594 >> Special tokens file saved in fv1_model_chunked/checkpoint-94000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 06:05:32,045 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-84000] due to args.save_total_limit
 81%|████████  | 94500/117000 [2:28:38<4:17:31,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 06:11:15,797 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 06:11:15,797 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 06:11:15,798 >>   Batch size = 16


{'loss': 1.0599, 'grad_norm': 0.22575746476650238, 'learning_rate': 8.851569381704583e-06, 'epoch': 6.46}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0605980157852173, 'eval_accuracy': 0.593560919367371, 'eval_runtime': 34.9948, 'eval_samples_per_second': 66.867, 'eval_steps_per_second': 4.201, 'epoch': 6.46}


[INFO|modeling_utils.py:4297] 2025-09-05 06:11:53,864 >> Model weights saved in fv1_model_chunked/checkpoint-94500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 06:11:53,869 >> tokenizer config file saved in fv1_model_chunked/checkpoint-94500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 06:11:53,871 >> Special tokens file saved in fv1_model_chunked/checkpoint-94500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 06:11:59,990 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-84500] due to args.save_total_limit
 81%|████████  | 95000/117000 [2:35:05<4:11:48,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 06:17:43,495 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 06:17:43,495 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 06:17:43,495 >>   Batch size = 16


{'loss': 1.0598, 'grad_norm': 0.20320767164230347, 'learning_rate': 8.4739436117991e-06, 'epoch': 6.5}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.31it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0605324506759644, 'eval_accuracy': 0.5936770517415678, 'eval_runtime': 35.0338, 'eval_samples_per_second': 66.793, 'eval_steps_per_second': 4.196, 'epoch': 6.5}


[INFO|modeling_utils.py:4297] 2025-09-05 06:18:21,601 >> Model weights saved in fv1_model_chunked/checkpoint-95000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 06:18:21,607 >> tokenizer config file saved in fv1_model_chunked/checkpoint-95000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 06:18:21,609 >> Special tokens file saved in fv1_model_chunked/checkpoint-95000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 06:18:28,197 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-85000] due to args.save_total_limit
 82%|████████▏ | 95500/117000 [2:41:33<4:06:57,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 06:24:11,535 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 06:24:11,535 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 06:24:11,535 >>   Batch size = 16


{'loss': 1.0598, 'grad_norm': 0.1525372713804245, 'learning_rate': 8.103802676300864e-06, 'epoch': 6.53}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.31it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.69it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.72it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0605638027191162, 'eval_accuracy': 0.5938654535428729, 'eval_runtime': 34.9811, 'eval_samples_per_second': 66.893, 'eval_steps_per_second': 4.202, 'epoch': 6.53}


[INFO|modeling_utils.py:4297] 2025-09-05 06:24:49,590 >> Model weights saved in fv1_model_chunked/checkpoint-95500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 06:24:49,594 >> tokenizer config file saved in fv1_model_chunked/checkpoint-95500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 06:24:49,596 >> Special tokens file saved in fv1_model_chunked/checkpoint-95500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 06:24:55,945 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-85500] due to args.save_total_limit
 82%|████████▏ | 96000/117000 [2:48:01<4:00:08,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 06:30:39,268 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 06:30:39,268 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 06:30:39,268 >>   Batch size = 16


{'loss': 1.0598, 'grad_norm': 0.28031811118125916, 'learning_rate': 7.741213290998733e-06, 'epoch': 6.56}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.64it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0605957508087158, 'eval_accuracy': 0.5933913159719612, 'eval_runtime': 34.9554, 'eval_samples_per_second': 66.942, 'eval_steps_per_second': 4.205, 'epoch': 6.56}


[INFO|modeling_utils.py:4297] 2025-09-05 06:31:17,291 >> Model weights saved in fv1_model_chunked/checkpoint-96000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 06:31:17,294 >> tokenizer config file saved in fv1_model_chunked/checkpoint-96000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 06:31:17,296 >> Special tokens file saved in fv1_model_chunked/checkpoint-96000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 06:31:24,031 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-86000] due to args.save_total_limit
 82%|████████▏ | 96500/117000 [2:54:29<3:54:34,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 06:37:07,351 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 06:37:07,351 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 06:37:07,351 >>   Batch size = 16


{'loss': 1.0599, 'grad_norm': 0.1823129802942276, 'learning_rate': 7.3862408105577705e-06, 'epoch': 6.6}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.27it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.71it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.060545563697815, 'eval_accuracy': 0.5940141698206214, 'eval_runtime': 34.9254, 'eval_samples_per_second': 67.0, 'eval_steps_per_second': 4.209, 'epoch': 6.6}


[INFO|modeling_utils.py:4297] 2025-09-05 06:37:45,345 >> Model weights saved in fv1_model_chunked/checkpoint-96500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 06:37:45,350 >> tokenizer config file saved in fv1_model_chunked/checkpoint-96500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 06:37:45,352 >> Special tokens file saved in fv1_model_chunked/checkpoint-96500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 06:37:51,950 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-86500] due to args.save_total_limit
 83%|████████▎ | 97000/117000 [3:00:57<3:48:53,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 06:43:35,184 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 06:43:35,184 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 06:43:35,184 >>   Batch size = 16


{'loss': 1.0599, 'grad_norm': 0.16982440650463104, 'learning_rate': 7.038949216739465e-06, 'epoch': 6.63}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.69it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0605396032333374, 'eval_accuracy': 0.5938132357487196, 'eval_runtime': 34.9208, 'eval_samples_per_second': 67.009, 'eval_steps_per_second': 4.21, 'epoch': 6.63}


[INFO|modeling_utils.py:4297] 2025-09-05 06:44:13,178 >> Model weights saved in fv1_model_chunked/checkpoint-97000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 06:44:13,187 >> tokenizer config file saved in fv1_model_chunked/checkpoint-97000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 06:44:13,189 >> Special tokens file saved in fv1_model_chunked/checkpoint-97000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 06:44:19,662 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-87000] due to args.save_total_limit
 83%|████████▎ | 97500/117000 [3:07:25<3:43:11,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 06:50:03,015 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 06:50:03,015 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 06:50:03,015 >>   Batch size = 16


{'loss': 1.0599, 'grad_norm': 0.144741952419281, 'learning_rate': 6.6994011068693685e-06, 'epoch': 6.67}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.76it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.65it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0605372190475464, 'eval_accuracy': 0.5935847306815049, 'eval_runtime': 34.9839, 'eval_samples_per_second': 66.888, 'eval_steps_per_second': 4.202, 'epoch': 6.67}


[INFO|modeling_utils.py:4297] 2025-09-05 06:50:41,068 >> Model weights saved in fv1_model_chunked/checkpoint-97500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 06:50:41,073 >> tokenizer config file saved in fv1_model_chunked/checkpoint-97500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 06:50:41,075 >> Special tokens file saved in fv1_model_chunked/checkpoint-97500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 06:50:47,646 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-87500] due to args.save_total_limit
 84%|████████▍ | 98000/117000 [3:13:53<3:37:22,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 06:56:30,860 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 06:56:30,860 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 06:56:30,860 >>   Batch size = 16


{'loss': 1.0599, 'grad_norm': 0.18166029453277588, 'learning_rate': 6.367657682554312e-06, 'epoch': 6.7}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.71it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0605300664901733, 'eval_accuracy': 0.5939548504064633, 'eval_runtime': 34.9676, 'eval_samples_per_second': 66.919, 'eval_steps_per_second': 4.204, 'epoch': 6.7}


[INFO|modeling_utils.py:4297] 2025-09-05 06:57:08,896 >> Model weights saved in fv1_model_chunked/checkpoint-98000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 06:57:08,902 >> tokenizer config file saved in fv1_model_chunked/checkpoint-98000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 06:57:08,904 >> Special tokens file saved in fv1_model_chunked/checkpoint-98000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 06:57:15,470 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-88000] due to args.save_total_limit
 84%|████████▍ | 98500/117000 [3:20:21<3:31:49,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 07:02:59,026 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 07:02:59,026 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 07:02:59,026 >>   Batch size = 16


{'loss': 1.0596, 'grad_norm': 0.21986140310764313, 'learning_rate': 6.043778738651102e-06, 'epoch': 6.74}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.34it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.57it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.69it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.64it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.64it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.060585618019104, 'eval_accuracy': 0.5937321937321938, 'eval_runtime': 34.9442, 'eval_samples_per_second': 66.964, 'eval_steps_per_second': 4.207, 'epoch': 6.74}


[INFO|modeling_utils.py:4297] 2025-09-05 07:03:37,039 >> Model weights saved in fv1_model_chunked/checkpoint-98500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 07:03:37,043 >> tokenizer config file saved in fv1_model_chunked/checkpoint-98500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 07:03:37,045 >> Special tokens file saved in fv1_model_chunked/checkpoint-98500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 07:03:43,601 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-88500] due to args.save_total_limit
 85%|████████▍ | 99000/117000 [3:26:49<3:26:06,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 07:09:27,281 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 07:09:27,281 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 07:09:27,281 >>   Batch size = 16


{'loss': 1.0599, 'grad_norm': 0.22011613845825195, 'learning_rate': 5.7278226524889346e-06, 'epoch': 6.77}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.69it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.71it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0605515241622925, 'eval_accuracy': 0.5937392953521986, 'eval_runtime': 34.9562, 'eval_samples_per_second': 66.941, 'eval_steps_per_second': 4.205, 'epoch': 6.77}


[INFO|modeling_utils.py:4297] 2025-09-05 07:10:05,304 >> Model weights saved in fv1_model_chunked/checkpoint-99000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 07:10:05,309 >> tokenizer config file saved in fv1_model_chunked/checkpoint-99000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 07:10:05,311 >> Special tokens file saved in fv1_model_chunked/checkpoint-99000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 07:10:11,875 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-89000] due to args.save_total_limit
 85%|████████▌ | 99500/117000 [3:33:17<3:20:17,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 07:15:55,330 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 07:15:55,330 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 07:15:55,330 >>   Batch size = 16


{'loss': 1.0599, 'grad_norm': 0.1544286161661148, 'learning_rate': 5.419846373347143e-06, 'epoch': 6.8}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.31it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604982376098633, 'eval_accuracy': 0.5934894854249693, 'eval_runtime': 34.975, 'eval_samples_per_second': 66.905, 'eval_steps_per_second': 4.203, 'epoch': 6.8}


[INFO|modeling_utils.py:4297] 2025-09-05 07:16:33,373 >> Model weights saved in fv1_model_chunked/checkpoint-99500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 07:16:33,377 >> tokenizer config file saved in fv1_model_chunked/checkpoint-99500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 07:16:33,379 >> Special tokens file saved in fv1_model_chunked/checkpoint-99500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 07:16:39,946 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-89500] due to args.save_total_limit
 85%|████████▌ | 100000/117000 [3:39:45<3:14:34,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 07:22:23,292 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 07:22:23,293 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 07:22:23,293 >>   Batch size = 16


{'loss': 1.0598, 'grad_norm': 0.2018319070339203, 'learning_rate': 5.119905412190429e-06, 'epoch': 6.84}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.31it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.69it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0605380535125732, 'eval_accuracy': 0.5937062937062937, 'eval_runtime': 34.9641, 'eval_samples_per_second': 66.926, 'eval_steps_per_second': 4.204, 'epoch': 6.84}


[INFO|modeling_utils.py:4297] 2025-09-05 07:23:01,324 >> Model weights saved in fv1_model_chunked/checkpoint-100000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 07:23:01,326 >> tokenizer config file saved in fv1_model_chunked/checkpoint-100000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 07:23:01,327 >> Special tokens file saved in fv1_model_chunked/checkpoint-100000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 07:23:07,884 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-90000] due to args.save_total_limit
 86%|████████▌ | 100500/117000 [3:46:13<3:08:46,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 07:28:51,298 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 07:28:51,298 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 07:28:51,299 >>   Batch size = 16


{'loss': 1.0599, 'grad_norm': 0.19933351874351501, 'learning_rate': 4.828053831663337e-06, 'epoch': 6.87}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.92it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0605138540267944, 'eval_accuracy': 0.5934556482943579, 'eval_runtime': 34.9544, 'eval_samples_per_second': 66.944, 'eval_steps_per_second': 4.205, 'epoch': 6.87}


[INFO|modeling_utils.py:4297] 2025-09-05 07:29:29,324 >> Model weights saved in fv1_model_chunked/checkpoint-100500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 07:29:29,331 >> tokenizer config file saved in fv1_model_chunked/checkpoint-100500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 07:29:29,333 >> Special tokens file saved in fv1_model_chunked/checkpoint-100500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 07:29:35,369 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-90500] due to args.save_total_limit
 86%|████████▋ | 101000/117000 [3:52:41<3:03:01,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 07:35:18,682 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 07:35:18,682 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 07:35:18,682 >>   Batch size = 16


{'loss': 1.06, 'grad_norm': 0.27455878257751465, 'learning_rate': 4.54434423634581e-06, 'epoch': 6.91}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.69it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604931116104126, 'eval_accuracy': 0.5938796567828826, 'eval_runtime': 34.9606, 'eval_samples_per_second': 66.932, 'eval_steps_per_second': 4.205, 'epoch': 6.91}


[INFO|modeling_utils.py:4297] 2025-09-05 07:35:56,716 >> Model weights saved in fv1_model_chunked/checkpoint-101000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 07:35:56,719 >> tokenizer config file saved in fv1_model_chunked/checkpoint-101000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 07:35:56,721 >> Special tokens file saved in fv1_model_chunked/checkpoint-101000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 07:36:03,174 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-91000] due to args.save_total_limit
 87%|████████▋ | 101500/117000 [3:59:08<2:57:13,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 07:41:46,327 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 07:41:46,327 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 07:41:46,327 >>   Batch size = 16


{'loss': 1.0599, 'grad_norm': 0.21137623488903046, 'learning_rate': 4.268827763271421e-06, 'epoch': 6.94}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.57it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.69it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.92it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.71it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.67it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.66it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.66it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0605151653289795, 'eval_accuracy': 0.593464838626129, 'eval_runtime': 34.9543, 'eval_samples_per_second': 66.945, 'eval_steps_per_second': 4.205, 'epoch': 6.94}


[INFO|modeling_utils.py:4297] 2025-09-05 07:42:24,358 >> Model weights saved in fv1_model_chunked/checkpoint-101500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 07:42:24,361 >> tokenizer config file saved in fv1_model_chunked/checkpoint-101500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 07:42:24,362 >> Special tokens file saved in fv1_model_chunked/checkpoint-101500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 07:42:30,910 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-91500] due to args.save_total_limit
 87%|████████▋ | 102000/117000 [4:05:36<2:51:36,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 07:48:14,041 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 07:48:14,041 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 07:48:14,041 >>   Batch size = 16


{'loss': 1.0596, 'grad_norm': 0.16543954610824585, 'learning_rate': 4.001554072710306e-06, 'epoch': 6.97}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.27it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.71it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.67it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.66it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.66it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604782104492188, 'eval_accuracy': 0.5937689550592776, 'eval_runtime': 34.9408, 'eval_samples_per_second': 66.97, 'eval_steps_per_second': 4.207, 'epoch': 6.97}


[INFO|modeling_utils.py:4297] 2025-09-05 07:48:52,058 >> Model weights saved in fv1_model_chunked/checkpoint-102000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 07:48:52,063 >> tokenizer config file saved in fv1_model_chunked/checkpoint-102000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 07:48:52,065 >> Special tokens file saved in fv1_model_chunked/checkpoint-102000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 07:48:58,614 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-92000] due to args.save_total_limit
 88%|████████▊ | 102500/117000 [4:12:04<2:45:58,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 07:54:41,867 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 07:54:41,867 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 07:54:41,867 >>   Batch size = 16


{'loss': 1.0596, 'grad_norm': 0.16057854890823364, 'learning_rate': 3.742571339218137e-06, 'epoch': 7.01}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.28it/s][A
  2%|▏         | 3/147 [00:00<00:22,  6.54it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.90it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.76it/s][A
  7%|▋         | 10/147 [00:01<00:29,  4.72it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.69it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.65it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604876279830933, 'eval_accuracy': 0.5936482275191952, 'eval_runtime': 35.0026, 'eval_samples_per_second': 66.852, 'eval_steps_per_second': 4.2, 'epoch': 7.01}


[INFO|modeling_utils.py:4297] 2025-09-05 07:55:19,937 >> Model weights saved in fv1_model_chunked/checkpoint-102500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 07:55:19,942 >> tokenizer config file saved in fv1_model_chunked/checkpoint-102500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 07:55:19,944 >> Special tokens file saved in fv1_model_chunked/checkpoint-102500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 07:55:26,539 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-92500] due to args.save_total_limit
 88%|████████▊ | 103000/117000 [4:18:32<2:40:17,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 08:01:09,924 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 08:01:09,924 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 08:01:09,924 >>   Batch size = 16


{'loss': 1.0598, 'grad_norm': 0.24393248558044434, 'learning_rate': 3.4919262429529308e-06, 'epoch': 7.04}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.28it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.67it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.26it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604981184005737, 'eval_accuracy': 0.5936774694839211, 'eval_runtime': 34.9588, 'eval_samples_per_second': 66.936, 'eval_steps_per_second': 4.205, 'epoch': 7.04}


[INFO|modeling_utils.py:4297] 2025-09-05 08:01:47,956 >> Model weights saved in fv1_model_chunked/checkpoint-103000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 08:01:47,963 >> tokenizer config file saved in fv1_model_chunked/checkpoint-103000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 08:01:47,965 >> Special tokens file saved in fv1_model_chunked/checkpoint-103000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 08:01:54,538 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-93000] due to args.save_total_limit
 88%|████████▊ | 103500/117000 [4:25:00<2:34:43,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 08:07:38,213 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 08:07:38,213 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 08:07:38,213 >>   Batch size = 16


{'loss': 1.0595, 'grad_norm': 0.24771809577941895, 'learning_rate': 3.2496639612612512e-06, 'epoch': 7.08}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.27it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.65it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604585409164429, 'eval_accuracy': 0.5938207551110777, 'eval_runtime': 34.9762, 'eval_samples_per_second': 66.903, 'eval_steps_per_second': 4.203, 'epoch': 7.08}


[INFO|modeling_utils.py:4297] 2025-09-05 08:08:16,256 >> Model weights saved in fv1_model_chunked/checkpoint-103500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 08:08:16,261 >> tokenizer config file saved in fv1_model_chunked/checkpoint-103500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 08:08:16,263 >> Special tokens file saved in fv1_model_chunked/checkpoint-103500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 08:08:22,820 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-93500] due to args.save_total_limit
 89%|████████▉ | 104000/117000 [4:31:28<2:28:57,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 08:14:06,528 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 08:14:06,528 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 08:14:06,528 >>   Batch size = 16


{'loss': 1.0597, 'grad_norm': 0.15931540727615356, 'learning_rate': 3.015828160535217e-06, 'epoch': 7.11}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.69it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.76it/s][A
  7%|▋         | 10/147 [00:01<00:29,  4.72it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.65it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0605061054229736, 'eval_accuracy': 0.5937518276227953, 'eval_runtime': 34.9738, 'eval_samples_per_second': 66.907, 'eval_steps_per_second': 4.203, 'epoch': 7.11}


[INFO|modeling_utils.py:4297] 2025-09-05 08:14:44,579 >> Model weights saved in fv1_model_chunked/checkpoint-104000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 08:14:44,584 >> tokenizer config file saved in fv1_model_chunked/checkpoint-104000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 08:14:44,586 >> Special tokens file saved in fv1_model_chunked/checkpoint-104000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 08:14:50,664 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-94000] due to args.save_total_limit
 89%|████████▉ | 104500/117000 [4:37:56<2:23:12,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 08:20:34,435 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 08:20:34,435 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 08:20:34,435 >>   Batch size = 16


{'loss': 1.0597, 'grad_norm': 0.20362937450408936, 'learning_rate': 2.790460988341909e-06, 'epoch': 7.15}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.33it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.64it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.060465693473816, 'eval_accuracy': 0.593625669432121, 'eval_runtime': 35.0152, 'eval_samples_per_second': 66.828, 'eval_steps_per_second': 4.198, 'epoch': 7.15}


[INFO|modeling_utils.py:4297] 2025-09-05 08:21:12,524 >> Model weights saved in fv1_model_chunked/checkpoint-104500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 08:21:12,531 >> tokenizer config file saved in fv1_model_chunked/checkpoint-104500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 08:21:12,533 >> Special tokens file saved in fv1_model_chunked/checkpoint-104500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 08:21:19,113 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-94500] due to args.save_total_limit
 90%|████████▉ | 105000/117000 [4:44:25<2:17:33,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 08:27:02,928 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 08:27:02,929 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 08:27:02,929 >>   Batch size = 16


{'loss': 1.0596, 'grad_norm': 0.12847255170345306, 'learning_rate': 2.5736030658264854e-06, 'epoch': 7.18}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604742765426636, 'eval_accuracy': 0.593823261565197, 'eval_runtime': 34.975, 'eval_samples_per_second': 66.905, 'eval_steps_per_second': 4.203, 'epoch': 7.18}


[INFO|modeling_utils.py:4297] 2025-09-05 08:27:40,978 >> Model weights saved in fv1_model_chunked/checkpoint-105000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 08:27:40,982 >> tokenizer config file saved in fv1_model_chunked/checkpoint-105000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 08:27:40,985 >> Special tokens file saved in fv1_model_chunked/checkpoint-105000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 08:27:47,434 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-95000] due to args.save_total_limit
 90%|█████████ | 105500/117000 [4:50:53<2:11:45,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 08:33:31,088 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 08:33:31,088 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 08:33:31,088 >>   Batch size = 16


{'loss': 1.0596, 'grad_norm': 0.20229926705360413, 'learning_rate': 2.3652934803904903e-06, 'epoch': 7.21}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.65it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604523420333862, 'eval_accuracy': 0.5940200182135666, 'eval_runtime': 34.9825, 'eval_samples_per_second': 66.891, 'eval_steps_per_second': 4.202, 'epoch': 7.21}


[INFO|modeling_utils.py:4297] 2025-09-05 08:34:09,142 >> Model weights saved in fv1_model_chunked/checkpoint-105500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 08:34:09,146 >> tokenizer config file saved in fv1_model_chunked/checkpoint-105500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 08:34:09,148 >> Special tokens file saved in fv1_model_chunked/checkpoint-105500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 08:34:15,514 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-95500] due to args.save_total_limit
 91%|█████████ | 106000/117000 [4:57:21<2:06:00,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 08:39:59,300 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 08:39:59,300 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 08:39:59,300 >>   Batch size = 16


{'loss': 1.0593, 'grad_norm': 0.17445948719978333, 'learning_rate': 2.1655697786465148e-06, 'epoch': 7.25}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.66it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.65it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.65it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.65it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.64it/s][A
 12%|█▏        | 17/147 [00:03<00:28,  4.64it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.64it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.64it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.060457706451416, 'eval_accuracy': 0.5939406471664537, 'eval_runtime': 34.9825, 'eval_samples_per_second': 66.891, 'eval_steps_per_second': 4.202, 'epoch': 7.25}


[INFO|modeling_utils.py:4297] 2025-09-05 08:40:37,353 >> Model weights saved in fv1_model_chunked/checkpoint-106000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 08:40:37,357 >> tokenizer config file saved in fv1_model_chunked/checkpoint-106000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 08:40:37,359 >> Special tokens file saved in fv1_model_chunked/checkpoint-106000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 08:40:43,418 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-96000] due to args.save_total_limit
 91%|█████████ | 106500/117000 [5:03:49<2:00:16,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 08:46:27,042 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 08:46:27,042 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 08:46:27,042 >>   Batch size = 16


{'loss': 1.0598, 'grad_norm': 0.2984573245048523, 'learning_rate': 1.974467959650689e-06, 'epoch': 7.28}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.74it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.71it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604686737060547, 'eval_accuracy': 0.5938529212722761, 'eval_runtime': 34.9564, 'eval_samples_per_second': 66.94, 'eval_steps_per_second': 4.205, 'epoch': 7.28}


[INFO|modeling_utils.py:4297] 2025-09-05 08:47:05,067 >> Model weights saved in fv1_model_chunked/checkpoint-106500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 08:47:05,072 >> tokenizer config file saved in fv1_model_chunked/checkpoint-106500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 08:47:05,074 >> Special tokens file saved in fv1_model_chunked/checkpoint-106500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 08:47:11,859 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-96500] due to args.save_total_limit
 91%|█████████▏| 107000/117000 [5:10:17<1:54:34,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 08:52:55,447 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 08:52:55,447 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 08:52:55,447 >>   Batch size = 16


{'loss': 1.06, 'grad_norm': 0.18628136813640594, 'learning_rate': 1.7920224684140563e-06, 'epoch': 7.32}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.33it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.06045401096344, 'eval_accuracy': 0.5937764744216357, 'eval_runtime': 34.9597, 'eval_samples_per_second': 66.934, 'eval_steps_per_second': 4.205, 'epoch': 7.32}


[INFO|modeling_utils.py:4297] 2025-09-05 08:53:33,478 >> Model weights saved in fv1_model_chunked/checkpoint-107000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 08:53:33,481 >> tokenizer config file saved in fv1_model_chunked/checkpoint-107000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 08:53:33,483 >> Special tokens file saved in fv1_model_chunked/checkpoint-107000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 08:53:40,058 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-97000] due to args.save_total_limit
 92%|█████████▏| 107500/117000 [5:16:46<1:48:49,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 08:59:23,763 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 08:59:23,763 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 08:59:23,763 >>   Batch size = 16


{'loss': 1.0595, 'grad_norm': 0.31037625670433044, 'learning_rate': 1.6182661896939954e-06, 'epoch': 7.35}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.31it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.71it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604653358459473, 'eval_accuracy': 0.5938328696393212, 'eval_runtime': 35.211, 'eval_samples_per_second': 66.457, 'eval_steps_per_second': 4.175, 'epoch': 7.35}


[INFO|modeling_utils.py:4297] 2025-09-05 09:00:02,046 >> Model weights saved in fv1_model_chunked/checkpoint-107500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 09:00:02,049 >> tokenizer config file saved in fv1_model_chunked/checkpoint-107500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 09:00:02,051 >> Special tokens file saved in fv1_model_chunked/checkpoint-107500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 09:00:08,100 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-97500] due to args.save_total_limit
 92%|█████████▏| 108000/117000 [5:23:14<1:43:07,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 09:05:51,774 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 09:05:51,774 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 09:05:51,774 >>   Batch size = 16


{'loss': 1.0598, 'grad_norm': 0.14515499770641327, 'learning_rate': 1.4532304420670162e-06, 'epoch': 7.38}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.76it/s][A
  7%|▋         | 10/147 [00:01<00:29,  4.72it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.66it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.060451626777649, 'eval_accuracy': 0.5937468147145567, 'eval_runtime': 34.9762, 'eval_samples_per_second': 66.903, 'eval_steps_per_second': 4.203, 'epoch': 7.38}


[INFO|modeling_utils.py:4297] 2025-09-05 09:06:29,826 >> Model weights saved in fv1_model_chunked/checkpoint-108000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 09:06:29,832 >> tokenizer config file saved in fv1_model_chunked/checkpoint-108000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 09:06:29,834 >> Special tokens file saved in fv1_model_chunked/checkpoint-108000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 09:06:36,426 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-98000] due to args.save_total_limit
 93%|█████████▎| 108500/117000 [5:29:42<1:37:25,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 09:12:20,240 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 09:12:20,240 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 09:12:20,240 >>   Batch size = 16


{'loss': 1.0596, 'grad_norm': 0.15235325694084167, 'learning_rate': 1.2969449722837091e-06, 'epoch': 7.42}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.67it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:29,  4.72it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.69it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.67it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.66it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.65it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.06045401096344, 'eval_accuracy': 0.5936900017545179, 'eval_runtime': 34.963, 'eval_samples_per_second': 66.928, 'eval_steps_per_second': 4.204, 'epoch': 7.42}


[INFO|modeling_utils.py:4297] 2025-09-05 09:12:58,283 >> Model weights saved in fv1_model_chunked/checkpoint-108500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 09:12:58,287 >> tokenizer config file saved in fv1_model_chunked/checkpoint-108500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 09:12:58,289 >> Special tokens file saved in fv1_model_chunked/checkpoint-108500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 09:13:04,321 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-98500] due to args.save_total_limit
 93%|█████████▎| 109000/117000 [5:36:10<1:31:41,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 09:18:48,160 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 09:18:48,160 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 09:18:48,160 >>   Batch size = 16


{'loss': 1.0595, 'grad_norm': 0.11024529486894608, 'learning_rate': 1.1494379499070618e-06, 'epoch': 7.45}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.69it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.66it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604592561721802, 'eval_accuracy': 0.5937159017804179, 'eval_runtime': 35.056, 'eval_samples_per_second': 66.75, 'eval_steps_per_second': 4.193, 'epoch': 7.45}


[INFO|modeling_utils.py:4297] 2025-09-05 09:19:26,300 >> Model weights saved in fv1_model_chunked/checkpoint-109000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 09:19:26,302 >> tokenizer config file saved in fv1_model_chunked/checkpoint-109000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 09:19:26,304 >> Special tokens file saved in fv1_model_chunked/checkpoint-109000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 09:19:32,874 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-99000] due to args.save_total_limit
 94%|█████████▎| 109500/117000 [5:42:39<1:26:00,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 09:25:16,688 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 09:25:16,688 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 09:25:16,688 >>   Batch size = 16


{'loss': 1.0596, 'grad_norm': 0.20674121379852295, 'learning_rate': 1.0107359622350876e-06, 'epoch': 7.49}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.90it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.76it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.72it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604552030563354, 'eval_accuracy': 0.5936031113450468, 'eval_runtime': 34.9454, 'eval_samples_per_second': 66.962, 'eval_steps_per_second': 4.207, 'epoch': 7.49}


[INFO|modeling_utils.py:4297] 2025-09-05 09:25:54,710 >> Model weights saved in fv1_model_chunked/checkpoint-109500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 09:25:54,712 >> tokenizer config file saved in fv1_model_chunked/checkpoint-109500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 09:25:54,714 >> Special tokens file saved in fv1_model_chunked/checkpoint-109500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 09:26:00,905 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-99500] due to args.save_total_limit
 94%|█████████▍| 110000/117000 [5:49:07<1:20:14,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 09:31:44,809 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 09:31:44,809 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 09:31:44,809 >>   Batch size = 16


{'loss': 1.0601, 'grad_norm': 0.29571467638015747, 'learning_rate': 8.808640095086141e-07, 'epoch': 7.52}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.76it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604442358016968, 'eval_accuracy': 0.5936327710521259, 'eval_runtime': 34.9409, 'eval_samples_per_second': 66.97, 'eval_steps_per_second': 4.207, 'epoch': 7.52}


[INFO|modeling_utils.py:4297] 2025-09-05 09:32:22,820 >> Model weights saved in fv1_model_chunked/checkpoint-110000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 09:32:22,824 >> tokenizer config file saved in fv1_model_chunked/checkpoint-110000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 09:32:22,826 >> Special tokens file saved in fv1_model_chunked/checkpoint-110000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 09:32:29,319 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-100000] due to args.save_total_limit
 94%|█████████▍| 110500/117000 [5:55:35<1:14:33,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 09:38:13,209 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 09:38:13,209 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 09:38:13,209 >>   Batch size = 16


{'loss': 1.0597, 'grad_norm': 0.19690418243408203, 'learning_rate': 7.59845500405082e-07, 'epoch': 7.56}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.69it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.66it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604504346847534, 'eval_accuracy': 0.5936181500697629, 'eval_runtime': 34.9836, 'eval_samples_per_second': 66.888, 'eval_steps_per_second': 4.202, 'epoch': 7.56}


[INFO|modeling_utils.py:4297] 2025-09-05 09:38:51,269 >> Model weights saved in fv1_model_chunked/checkpoint-110500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 09:38:51,272 >> tokenizer config file saved in fv1_model_chunked/checkpoint-110500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 09:38:51,274 >> Special tokens file saved in fv1_model_chunked/checkpoint-110500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 09:38:57,509 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-100500] due to args.save_total_limit
 95%|█████████▍| 111000/117000 [6:02:03<1:08:43,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 09:44:41,445 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 09:44:41,445 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 09:44:41,445 >>   Batch size = 16


{'loss': 1.0595, 'grad_norm': 0.2349674552679062, 'learning_rate': 6.477022478193317e-07, 'epoch': 7.59}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.28it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.92it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.71it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604395866394043, 'eval_accuracy': 0.5937275985663082, 'eval_runtime': 34.9925, 'eval_samples_per_second': 66.872, 'eval_steps_per_second': 4.201, 'epoch': 7.59}


[INFO|modeling_utils.py:4297] 2025-09-05 09:45:19,513 >> Model weights saved in fv1_model_chunked/checkpoint-111000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 09:45:19,518 >> tokenizer config file saved in fv1_model_chunked/checkpoint-111000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 09:45:19,520 >> Special tokens file saved in fv1_model_chunked/checkpoint-111000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 09:45:26,093 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-101000] due to args.save_total_limit
 95%|█████████▌| 111500/117000 [6:08:32<1:03:01,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 09:51:09,942 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 09:51:09,942 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 09:51:09,942 >>   Batch size = 16


{'loss': 1.06, 'grad_norm': 0.15959298610687256, 'learning_rate': 5.444544649319261e-07, 'epoch': 7.62}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.28it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.76it/s][A
  7%|▋         | 10/147 [00:01<00:29,  4.72it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.060437560081482, 'eval_accuracy': 0.5940045617464972, 'eval_runtime': 34.9731, 'eval_samples_per_second': 66.909, 'eval_steps_per_second': 4.203, 'epoch': 7.62}


[INFO|modeling_utils.py:4297] 2025-09-05 09:51:47,993 >> Model weights saved in fv1_model_chunked/checkpoint-111500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 09:51:47,997 >> tokenizer config file saved in fv1_model_chunked/checkpoint-111500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 09:51:47,999 >> Special tokens file saved in fv1_model_chunked/checkpoint-111500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 09:51:54,054 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-101500] due to args.save_total_limit
 96%|█████████▌| 112000/117000 [6:15:00<57:17,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 09:57:37,840 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 09:57:37,840 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 09:57:37,840 >>   Batch size = 16


{'loss': 1.0593, 'grad_norm': 0.12756748497486115, 'learning_rate': 4.501207615658254e-07, 'epoch': 7.66}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.71it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.68it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.67it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.66it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604372024536133, 'eval_accuracy': 0.5937601824698598, 'eval_runtime': 34.9683, 'eval_samples_per_second': 66.918, 'eval_steps_per_second': 4.204, 'epoch': 7.66}


[INFO|modeling_utils.py:4297] 2025-09-05 09:58:15,879 >> Model weights saved in fv1_model_chunked/checkpoint-112000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 09:58:15,884 >> tokenizer config file saved in fv1_model_chunked/checkpoint-112000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 09:58:15,886 >> Special tokens file saved in fv1_model_chunked/checkpoint-112000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 09:58:21,958 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-102000] due to args.save_total_limit
 96%|█████████▌| 112500/117000 [6:21:28<51:33,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 10:04:05,712 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 10:04:05,713 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 10:04:05,713 >>   Batch size = 16


{'loss': 1.0598, 'grad_norm': 0.21197138726711273, 'learning_rate': 3.6471814083208766e-07, 'epoch': 7.69}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.76it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604381561279297, 'eval_accuracy': 0.5935818064850323, 'eval_runtime': 35.0682, 'eval_samples_per_second': 66.727, 'eval_steps_per_second': 4.192, 'epoch': 7.69}


[INFO|modeling_utils.py:4297] 2025-09-05 10:04:43,853 >> Model weights saved in fv1_model_chunked/checkpoint-112500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 10:04:43,857 >> tokenizer config file saved in fv1_model_chunked/checkpoint-112500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 10:04:43,858 >> Special tokens file saved in fv1_model_chunked/checkpoint-112500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 10:04:50,417 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-102500] due to args.save_total_limit
 97%|█████████▋| 113000/117000 [6:27:56<45:49,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 10:10:34,155 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 10:10:34,155 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 10:10:34,155 >>   Batch size = 16


{'loss': 1.0596, 'grad_norm': 0.1766732782125473, 'learning_rate': 2.8826199606516447e-07, 'epoch': 7.73}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.64it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604362487792969, 'eval_accuracy': 0.5935546532320726, 'eval_runtime': 34.9943, 'eval_samples_per_second': 66.868, 'eval_steps_per_second': 4.201, 'epoch': 7.73}


[INFO|modeling_utils.py:4297] 2025-09-05 10:11:12,565 >> Model weights saved in fv1_model_chunked/checkpoint-113000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 10:11:12,568 >> tokenizer config file saved in fv1_model_chunked/checkpoint-113000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 10:11:12,570 >> Special tokens file saved in fv1_model_chunked/checkpoint-113000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 10:11:19,057 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-103000] due to args.save_total_limit
 97%|█████████▋| 113500/117000 [6:34:25<40:08,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 10:17:02,943 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 10:17:02,943 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 10:17:02,943 >>   Batch size = 16


{'loss': 1.0599, 'grad_norm': 0.170184925198555, 'learning_rate': 2.2076610804832032e-07, 'epoch': 7.76}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.33it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.69it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.64it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604344606399536, 'eval_accuracy': 0.5935855661662113, 'eval_runtime': 34.9752, 'eval_samples_per_second': 66.904, 'eval_steps_per_second': 4.203, 'epoch': 7.76}


[INFO|modeling_utils.py:4297] 2025-09-05 10:17:41,000 >> Model weights saved in fv1_model_chunked/checkpoint-113500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 10:17:41,004 >> tokenizer config file saved in fv1_model_chunked/checkpoint-113500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 10:17:41,006 >> Special tokens file saved in fv1_model_chunked/checkpoint-113500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 10:17:47,300 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-103500] due to args.save_total_limit
 97%|█████████▋| 114000/117000 [6:40:53<34:24,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 10:23:31,320 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 10:23:31,320 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 10:23:31,320 >>   Batch size = 16


{'loss': 1.0594, 'grad_norm': 0.20238178968429565, 'learning_rate': 1.6224264252973076e-07, 'epoch': 7.79}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.28it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.67it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.26it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.90it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.65it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.64it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604329109191895, 'eval_accuracy': 0.5935826419697388, 'eval_runtime': 35.0566, 'eval_samples_per_second': 66.749, 'eval_steps_per_second': 4.193, 'epoch': 7.79}


[INFO|modeling_utils.py:4297] 2025-09-05 10:24:09,457 >> Model weights saved in fv1_model_chunked/checkpoint-114000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 10:24:09,461 >> tokenizer config file saved in fv1_model_chunked/checkpoint-114000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 10:24:09,462 >> Special tokens file saved in fv1_model_chunked/checkpoint-114000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 10:24:15,598 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-104000] due to args.save_total_limit
 98%|█████████▊| 114500/117000 [6:47:21<28:38,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 10:29:59,434 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 10:29:59,434 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 10:29:59,434 >>   Batch size = 16


{'loss': 1.0596, 'grad_norm': 0.18853360414505005, 'learning_rate': 1.1270214802969193e-07, 'epoch': 7.83}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:22,  6.54it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.76it/s][A
  7%|▋         | 10/147 [00:01<00:29,  4.72it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.69it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.66it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.65it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604352951049805, 'eval_accuracy': 0.5935224870708742, 'eval_runtime': 35.0073, 'eval_samples_per_second': 66.843, 'eval_steps_per_second': 4.199, 'epoch': 7.83}


[INFO|modeling_utils.py:4297] 2025-09-05 10:30:37,513 >> Model weights saved in fv1_model_chunked/checkpoint-114500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 10:30:37,517 >> tokenizer config file saved in fv1_model_chunked/checkpoint-114500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 10:30:37,520 >> Special tokens file saved in fv1_model_chunked/checkpoint-114500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 10:30:43,656 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-104500] due to args.save_total_limit
 98%|█████████▊| 115000/117000 [6:53:49<22:55,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 10:36:27,534 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 10:36:27,535 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 10:36:27,535 >>   Batch size = 16


{'loss': 1.0598, 'grad_norm': 0.26955127716064453, 'learning_rate': 7.215355393928592e-08, 'epoch': 7.86}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.76it/s][A
  7%|▋         | 10/147 [00:01<00:29,  4.72it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.66it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604318380355835, 'eval_accuracy': 0.5937480679416163, 'eval_runtime': 35.0543, 'eval_samples_per_second': 66.754, 'eval_steps_per_second': 4.193, 'epoch': 7.86}


[INFO|modeling_utils.py:4297] 2025-09-05 10:37:05,668 >> Model weights saved in fv1_model_chunked/checkpoint-115000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 10:37:05,673 >> tokenizer config file saved in fv1_model_chunked/checkpoint-115000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 10:37:05,675 >> Special tokens file saved in fv1_model_chunked/checkpoint-115000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 10:37:11,834 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-105000] due to args.save_total_limit
 99%|█████████▊| 115500/117000 [7:00:18<17:11,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 10:42:55,660 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 10:42:55,661 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 10:42:55,661 >>   Batch size = 16


{'loss': 1.0592, 'grad_norm': 0.1567329317331314, 'learning_rate': 4.060416891092378e-08, 'epoch': 7.9}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.32it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.57it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.69it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.28it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.05it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.83it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604314804077148, 'eval_accuracy': 0.5936540759121405, 'eval_runtime': 34.9955, 'eval_samples_per_second': 66.866, 'eval_steps_per_second': 4.201, 'epoch': 7.9}


[INFO|modeling_utils.py:4297] 2025-09-05 10:43:33,735 >> Model weights saved in fv1_model_chunked/checkpoint-115500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 10:43:33,737 >> tokenizer config file saved in fv1_model_chunked/checkpoint-115500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 10:43:33,738 >> Special tokens file saved in fv1_model_chunked/checkpoint-115500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 10:43:40,286 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-105500] due to args.save_total_limit
 99%|█████████▉| 116000/117000 [7:06:46<11:28,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 10:49:24,148 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 10:49:24,148 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 10:49:24,148 >>   Batch size = 16


{'loss': 1.0596, 'grad_norm': 0.2833642065525055, 'learning_rate': 1.80596795410104e-08, 'epoch': 7.93}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.33it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.76it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.0604311227798462, 'eval_accuracy': 0.5938579341805148, 'eval_runtime': 35.3321, 'eval_samples_per_second': 66.229, 'eval_steps_per_second': 4.161, 'epoch': 7.93}


[INFO|modeling_utils.py:4297] 2025-09-05 10:50:02,569 >> Model weights saved in fv1_model_chunked/checkpoint-116000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 10:50:02,573 >> tokenizer config file saved in fv1_model_chunked/checkpoint-116000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 10:50:02,575 >> Special tokens file saved in fv1_model_chunked/checkpoint-116000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 10:50:08,705 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-106000] due to args.save_total_limit
100%|█████████▉| 116500/117000 [7:13:14<05:43,  1.45it/s][INFO|trainer.py:4623] 2025-09-05 10:55:52,627 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 10:55:52,627 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 10:55:52,627 >>   Batch size = 16


{'loss': 1.0597, 'grad_norm': 0.14142785966396332, 'learning_rate': 4.524149344942164e-09, 'epoch': 7.97}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.30it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.56it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.76it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.68it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.66it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.65it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.060430884361267, 'eval_accuracy': 0.5937275985663082, 'eval_runtime': 34.9998, 'eval_samples_per_second': 66.858, 'eval_steps_per_second': 4.2, 'epoch': 7.97}


[INFO|modeling_utils.py:4297] 2025-09-05 10:56:30,964 >> Model weights saved in fv1_model_chunked/checkpoint-116500/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 10:56:30,970 >> tokenizer config file saved in fv1_model_chunked/checkpoint-116500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 10:56:30,973 >> Special tokens file saved in fv1_model_chunked/checkpoint-116500/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 10:56:37,640 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-106500] due to args.save_total_limit
100%|██████████| 117000/117000 [7:19:43<00:00,  1.46it/s][INFO|trainer.py:4623] 2025-09-05 11:02:21,609 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 11:02:21,609 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 11:02:21,609 >>   Batch size = 16


{'loss': 1.0594, 'grad_norm': 0.16533973813056946, 'learning_rate': 1.8024698400509467e-14, 'epoch': 8.0}



  0%|          | 0/147 [00:00<?, ?it/s][A
  1%|▏         | 2/147 [00:00<00:15,  9.29it/s][A
  2%|▏         | 3/147 [00:00<00:21,  6.55it/s][A
  3%|▎         | 4/147 [00:00<00:25,  5.68it/s][A
  3%|▎         | 5/147 [00:00<00:26,  5.27it/s][A
  4%|▍         | 6/147 [00:01<00:27,  5.04it/s][A
  5%|▍         | 7/147 [00:01<00:28,  4.91it/s][A
  5%|▌         | 8/147 [00:01<00:28,  4.82it/s][A
  6%|▌         | 9/147 [00:01<00:28,  4.77it/s][A
  7%|▋         | 10/147 [00:01<00:28,  4.73it/s][A
  7%|▋         | 11/147 [00:02<00:28,  4.70it/s][A
  8%|▊         | 12/147 [00:02<00:28,  4.69it/s][A
  9%|▉         | 13/147 [00:02<00:28,  4.67it/s][A
 10%|▉         | 14/147 [00:02<00:28,  4.66it/s][A
 10%|█         | 15/147 [00:03<00:28,  4.66it/s][A
 11%|█         | 16/147 [00:03<00:28,  4.65it/s][A
 12%|█▏        | 17/147 [00:03<00:27,  4.65it/s][A
 12%|█▏        | 18/147 [00:03<00:27,  4.65it/s][A
 13%|█▎        | 19/147 [00:03<00:27,  4.64it/s][A
 14%|█▎        | 20/147 [00:

{'eval_loss': 1.060430884361267, 'eval_accuracy': 0.5937033695098212, 'eval_runtime': 35.6459, 'eval_samples_per_second': 65.646, 'eval_steps_per_second': 4.124, 'epoch': 8.0}


[INFO|modeling_utils.py:4297] 2025-09-05 11:03:03,330 >> Model weights saved in fv1_model_chunked/checkpoint-117000/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 11:03:03,351 >> tokenizer config file saved in fv1_model_chunked/checkpoint-117000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 11:03:03,365 >> Special tokens file saved in fv1_model_chunked/checkpoint-117000/special_tokens_map.json
[INFO|trainer.py:4398] 2025-09-05 11:03:15,482 >> Deleting older checkpoint [fv1_model_chunked/checkpoint-107000] due to args.save_total_limit
[INFO|trainer.py:2808] 2025-09-05 11:03:15,627 >> 

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 117000/117000 [7:20:38<00:00,  4.43it/s]
[INFO|trainer.py:4289] 2025-09-05 11:03:15,656 >> Saving model checkpoint to fv1_model_chunked


{'train_runtime': 26439.9654, 'train_samples_per_second': 70.802, 'train_steps_per_second': 4.425, 'train_loss': 0.30346304034371663, 'epoch': 8.0}


[INFO|configuration_utils.py:491] 2025-09-05 11:03:15,692 >> Configuration saved in fv1_model_chunked/config.json
[INFO|configuration_utils.py:826] 2025-09-05 11:03:15,720 >> Configuration saved in fv1_model_chunked/generation_config.json
[INFO|modeling_utils.py:4297] 2025-09-05 11:03:21,511 >> Model weights saved in fv1_model_chunked/model.safetensors
[INFO|tokenization_utils_base.py:2563] 2025-09-05 11:03:21,542 >> tokenizer config file saved in fv1_model_chunked/tokenizer_config.json
[INFO|tokenization_utils_base.py:2572] 2025-09-05 11:03:21,564 >> Special tokens file saved in fv1_model_chunked/special_tokens_map.json


***** train metrics *****
  epoch                    =         8.0
  total_flos               = 911091796GF
  train_loss               =      0.3035
  train_runtime            =  7:20:39.96
  train_samples            =      234000
  train_samples_per_second =      70.802
  train_steps_per_second   =       4.425
09/05/2025 11:03:21 - INFO - __main__ - *** Evaluate ***


[INFO|trainer.py:4623] 2025-09-05 11:03:21,729 >> 
***** Running Evaluation *****
[INFO|trainer.py:4625] 2025-09-05 11:03:21,730 >>   Num examples = 2340
[INFO|trainer.py:4628] 2025-09-05 11:03:21,730 >>   Batch size = 16
100%|██████████| 147/147 [00:35<00:00,  4.17it/s]
[INFO|modelcard.py:456] 2025-09-05 11:03:57,357 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.5937033695098212}]}


***** eval metrics *****
  epoch                   =        8.0
  eval_accuracy           =     0.5937
  eval_loss               =     1.0604
  eval_runtime            = 0:00:35.52
  eval_samples            =       2340
  eval_samples_per_second =     65.868
  eval_steps_per_second   =      4.138
  perplexity              =     2.8876
[1;34mwandb[0m: 
[1;34mwandb[0m: 🚀 View run [33mfv1_model_chunked[0m at: [34mhttps://wandb.ai/carobgt-ucl/foraging_models/runs/ci75uvds[0m
[1;34mwandb[0m: Find logs at: [1;35mwandb_local_runs/wandb/run-20250905_034235-ci75uvds/logs[0m
