## Setting up

In [1]:
%%capture
# Install required libraries
!pip install git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3

In [2]:
%%capture
# Install required libraries
%pip install -U datasets 
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes

In [3]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# Login in to huggingface for access to the pre trained model
hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
login(hf_token)

In [4]:
# Wandb setup parameters (for hyperparameter sweeps and run reporting)
%env WANDB_PROJECT=HPML_Gemma
%env WANDB_LOG_MODEL=false
%env WANDB_WATCH=false

env: WANDB_PROJECT=HPML_Gemma
env: WANDB_LOG_MODEL=false
env: WANDB_WATCH=false


In [5]:
import wandb
wandb.login()

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maditya-nyu[0m ([33maditya-nyu-hpml[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Loading the model and tokenizer

In [7]:
from transformers import AutoTokenizer, Gemma3ForConditionalGeneration, BitsAndBytesConfig, TrainingArguments
import kagglehub
import torch
from transformers import AutoTokenizer
from transformers.models.gemma3 import Gemma3ForCausalLM
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
    PeftModelForCausalLM
)

device = "cuda" if torch.cuda.is_available() else "cpu"
# Load pre trained model from Kaggle
GEMMA_PATH = kagglehub.model_download("google/gemma-3/transformers/gemma-3-1b-it")
# Quantization configuration (4bit)
bnb_config = BitsAndBytesConfig(
  load_in_4bit=True,
  bnb_4bit_quant_type="nf4",
  bnb_4bit_compute_dtype=torch.float16,
)
# Create quantized model
model = Gemma3ForCausalLM.from_pretrained(GEMMA_PATH, device_map="auto", attn_implementation='eager', quantization_config=bnb_config).eval()
print(model)
# Create tokenizer for using prompt interface
tokenizer = AutoTokenizer.from_pretrained(GEMMA_PATH)

Gemma3ForCausalLM(
  (model): Gemma3TextModel(
    (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 1152, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma3DecoderLayer(
        (self_attn): Gemma3Attention(
          (q_proj): Linear(in_features=1152, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1152, out_features=256, bias=False)
          (v_proj): Linear(in_features=1152, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1152, bias=False)
          (q_norm): Gemma3RMSNorm((256,), eps=1e-06)
          (k_norm): Gemma3RMSNorm((256,), eps=1e-06)
        )
        (mlp): Gemma3MLP(
          (gate_proj): Linear(in_features=1152, out_features=6912, bias=False)
          (up_proj): Linear(in_features=1152, out_features=6912, bias=False)
          (down_proj): Linear(in_features=6912, out_features=1152, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma3RMSNorm((11

## Loading and processing the dataset

In [8]:
# Prompt structure
train_prompt_style="""
Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Question:
{}

### Response:
<think>
{}
</think>
{}
"""

In [10]:
# Response formatting helper function
def formatting_prompts_func(examples):
    inputs = examples["Open-ended Verifiable Question"]
    complex_cots = examples["Complex_CoT"]
    outputs = examples["Response"]
    texts = []
    for question, cot, response in zip(inputs, complex_cots, outputs):
        # Append the EOS token to the response if it's not already there
        if not response.endswith(tokenizer.eos_token):
            response += tokenizer.eos_token
        text = train_prompt_style.format(question, cot, response)
        texts.append(text)
    return {"text": texts}

In [11]:
%%time
# Load and preprocess dataset
from datasets import load_dataset
dataset = load_dataset("TheFinAI/Fino1_Reasoning_Path_FinQA", split = "train[0:500]",trust_remote_code=True)
dataset = dataset.map(formatting_prompts_func, batched = True,)
dataset["text"][0]

README.md:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5499 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

CPU times: user 1.02 s, sys: 229 ms, total: 1.25 s
Wall time: 5.49 s


"\nBelow is an instruction that describes a task, paired with an input that provides further context. \nWrite a response that appropriately completes the request. \nBefore answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\n\n### Question:\nPlease answer the given financial question based on the context.\nContext: amortization expense , which is included in selling , general and administrative expenses , was $ 13.0 million , $ 13.9 million and $ 8.5 million for the years ended december 31 , 2016 , 2015 and 2014 , respectively . the following is the estimated amortization expense for the company 2019s intangible assets as of december 31 , 2016 : ( in thousands ) .\n|2017|$ 10509|\n|2018|9346|\n|2019|9240|\n|2020|7201|\n|2021|5318|\n|2022 and thereafter|16756|\n|amortization expense of intangible assets|$ 58370|\nat december 31 , 2016 , 2015 and 2014 , the company determined that its goodwill and indefinite

In [13]:
from transformers import DataCollatorForLanguageModeling
# Process dataset for training and evaluation
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

## Model inference before fine-tuning

In [14]:
# Response prompt structure
prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Question:
{}

### Response:
<think>
{}
"""

In [12]:
from torch.profiler import profile, record_function, ProfilerActivity


In [15]:
%%time
question = dataset[0]['Open-ended Verifiable Question']
# Generate a response to a question
print("Question: ", question)
# Tokenize question and move to device
inputs = tokenizer(
    [prompt_style.format(question, "") + tokenizer.eos_token],
    return_tensors="pt"
).to("cuda")
# Timing events
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
# Forward pass
outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=1200,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )
end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
# Decode and display response
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)

print(response[0].split("### Response:")[1])

print("GPU Time:",elapsed_time_ms)


Question:  Please answer the given financial question based on the context.
Context: amortization expense , which is included in selling , general and administrative expenses , was $ 13.0 million , $ 13.9 million and $ 8.5 million for the years ended december 31 , 2016 , 2015 and 2014 , respectively . the following is the estimated amortization expense for the company 2019s intangible assets as of december 31 , 2016 : ( in thousands ) .
|2017|$ 10509|
|2018|9346|
|2019|9240|
|2020|7201|
|2021|5318|
|2022 and thereafter|16756|
|amortization expense of intangible assets|$ 58370|
at december 31 , 2016 , 2015 and 2014 , the company determined that its goodwill and indefinite- lived intangible assets were not impaired . 5 . credit facility and other long term debt credit facility the company is party to a credit agreement that provides revolving commitments for up to $ 1.25 billion of borrowings , as well as term loan commitments , in each case maturing in january 2021 . as of december 31 ,

In [16]:
print("GPU Time:",elapsed_time_ms/1000, " s")

GPU Time: 68.91490625  s


## Setting up the model

In [17]:
%%time
from trl import SFTTrainer
from transformers import TrainingArguments

# LoRA Configuration
peft_config = LoraConfig(
    lora_alpha=16,                           # Scaling factor for LoRA
    lora_dropout=0.05,                       # Add slight dropout for regularization
    r=64,                                    # Rank of the LoRA update matrices
    bias="none",                             # No bias reparameterization
    task_type="CAUSAL_LM",                   # Task type: Causal Language Modeling
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],  # Target modules for LoRA
)


# Training Arguments
training_arguments = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    logging_steps=5,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="none",
    max_steps=50
)

# Initialize the Trainer
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset,
    peft_config=peft_config,
    data_collator=data_collator,
)

Converting train dataset to ChatML:   0%|          | 0/500 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


CPU times: user 6.23 s, sys: 465 ms, total: 6.7 s
Wall time: 6.29 s


## Model training

In [18]:
# Clear the cache and train the adapters
torch.cuda.empty_cache()
trainer_stats = trainer.train()

Step,Training Loss
5,2.9976
10,2.6183
15,2.3433
20,2.1465
25,2.1578
30,1.9158
35,1.9981
40,2.0306
45,1.9876
50,1.6141


## Model inference after fine-tuning

In [19]:
%%time
question = dataset[0]['Open-ended Verifiable Question']
# Prepare another question for testing
inputs = tokenizer(
    [prompt_style.format(question, "") + tokenizer.eos_token],
    return_tensors="pt"
).to("cuda")



CPU times: user 4.85 ms, sys: 963 µs, total: 5.81 ms
Wall time: 4.79 ms


In [20]:
%%time
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
# Forward pass
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,
)

end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)

response = tokenizer.batch_decode(outputs, skip_special_tokens=True)

print(response[0].split("### Response:")[1])

print("GPU Time:",elapsed_time_ms)


<think>

think:
Alright, let's dive into this amortization expense situation. The question asks about the portion of the estimated amortization expense that will be recognized in 2017. First, we need to figure out what the amortization expense is for 2017. The context states that the amortization expense for 2017 was $ 13.0 million.

Now, let's see what the amortization expense was for 2016. The context says that the amortization expense for 2016 was $ 13.9 million.

Next, we need to see how much of the amortization expense was recognized in 2016. The context says that the amortization expense for 2016 was $ 8.5 million.

Now, let's see how much of the amortization expense was recognized in 2017. The context says that the amortization expense for 2017 was $ 13.0 million.

So, to find out how much of the amortization expense was recognized in 2017, we simply need to subtract the amortization expense for 2016 from the amortization expense for 2017.

Let's do the subtraction.

Subtractin

### Test with another question

In [30]:
%%time
question = dataset[10]['Open-ended Verifiable Question']

inputs = tokenizer(
    [prompt_style.format(question, "") + tokenizer.eos_token],
    return_tensors="pt"
).to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,
)
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(response[0].split("### Response:")[1])


<think>


The question asks for the total purchase price of impella cardiosystems, considering all contingent payments. The provided text doesn't explicitly state the amount of the contingent payments. However, we can infer the following:

*   **Cash Payments:** The text mentions a $5.6 million contingent payment from January 30, 2007, and a potential additional $1.8 million in cash payments.
*   **Stock Payments:** The text mentions $42.2 million of common stock and $1.6 million of cash paid to former shareholders.
*   **Transaction Costs & Legal Fees:** There's a $1.3 million cost for financial advisory and legal services.
*   **Fasb Interpretation:** The disclosure requirements for "guarantees of indebtedness of others" and "interpretation of fasb statements no . 5 , 57 and 107" suggest potential future contingent payments.

Considering these factors, the total purchase price would likely be around **$45.1 million + $1.3 million + $5.6 million + $1.8 million = $67.5 million**.

The

## Saving the model locally

In [31]:
new_model_local = "Gemma-3-1B-Fin-QA-Reasoning"
model.save_pretrained(new_model_local) # Local saving
tokenizer.save_pretrained(new_model_local)

('Gemma-3-1B-Fin-QA-Reasoning/tokenizer_config.json',
 'Gemma-3-1B-Fin-QA-Reasoning/special_tokens_map.json',
 'Gemma-3-1B-Fin-QA-Reasoning/tokenizer.model',
 'Gemma-3-1B-Fin-QA-Reasoning/added_tokens.json',
 'Gemma-3-1B-Fin-QA-Reasoning/tokenizer.json')

## Wandb Sweep

In [18]:
# Similar to main training, overload function wandb config instead of pre defined
from trl import SFTTrainer
from transformers import TrainingArguments

def configure_trainer(config=None):
    with wandb.init(config=config):
        config = wandb.config
        # LoRA Configuration
        peft_config = LoraConfig(
            lora_alpha=16,                           # Scaling factor for LoRA
            lora_dropout=0.1,                       # Add slight dropout for regularization
            r=config.rank,                                    # Rank of the LoRA update matrices
            bias="none",                             # No bias reparameterization
            task_type="CAUSAL_LM",                   # Task type: Causal Language Modeling
            target_modules=[
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj"
            ],  # Target modules for LoRA
        )


        # Training Arguments
        training_arguments = TrainingArguments(
            output_dir="output",
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=2,
            optim="paged_adamw_32bit",
            num_train_epochs=config.epochs,
            logging_steps=5,
            warmup_steps=10,
            logging_strategy="steps",
            learning_rate=config.lr,
            fp16=False,
            bf16=False,
            group_by_length=True,
            report_to="wandb",
            max_steps=50
        )

        # Initialize the Trainer
        trainer = SFTTrainer(
            model=model,
            args=training_arguments,
            train_dataset=dataset,
            peft_config=peft_config,
            data_collator=data_collator,
        )
        torch.cuda.empty_cache()
        trainer_stats = trainer.train()

In [19]:
# Wandb sweep parameters
sweep_configuration = {
    "method": "random",
    "metric": {"goal": "minimize", "name": "loss"},
    "parameters": {
        "epochs":{"values": [1, 2]},
        "lr":{"values":[1e-3, 1e-4, 5e-4]},
        "rank":{"values":[4, 16]}
    },
}
sweep_id = wandb.sweep(sweep=sweep_configuration, project="HPML_Gemma")
wandb.agent(sweep_id, function=configure_trainer, count=6)

Create sweep with ID: 084yepwg
Sweep URL: https://wandb.ai/aditya-nyu-hpml/HPML_Gemma/sweeps/084yepwg


[34m[1mwandb[0m: Agent Starting Run: 9o4a0h6d with config:
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	rank: 16


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
5,3.0648
10,2.5423
15,2.299
20,2.1062
25,2.118
30,1.9061
35,2.0015
40,2.0568
45,2.0141
50,1.6464


0,1
train/epoch,▁▂▃▃▄▅▆▆▇██
train/global_step,▁▂▃▃▄▅▆▆▇██
train/grad_norm,█▄▃▂▂▄▁▁▁▂
train/learning_rate,▅█▇▆▅▅▄▃▂▁
train/loss,█▅▄▃▃▂▃▃▃▁
train/mean_token_accuracy,▁▃▄▅▅▆▆▆▆█
train/num_tokens,▁▂▃▃▄▅▆▇▇█

0,1
total_flos,423970885456128.0
train/epoch,0.2
train/global_step,50.0
train/grad_norm,0.52628
train/learning_rate,0.0
train/loss,1.6464
train/mean_token_accuracy,0.64073
train/num_tokens,100819.0
train_loss,2.17552
train_runtime,76.8296


[34m[1mwandb[0m: Agent Starting Run: o0e2funi with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	lr: 0.0005
[34m[1mwandb[0m: 	rank: 16


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
5,3.1082
10,2.6342
15,2.4086
20,2.2406
25,2.2537
30,2.0045
35,2.08
40,2.1149
45,2.0648
50,1.7387


0,1
train/epoch,▁▂▃▃▄▅▆▆▇██
train/global_step,▁▂▃▃▄▅▆▆▇██
train/grad_norm,█▂▃▂▂▂▂▁▁▂
train/learning_rate,▅█▇▆▅▅▄▃▂▁
train/loss,█▆▄▄▄▂▃▃▃▁
train/mean_token_accuracy,▁▃▃▄▅▆▆▆▆█
train/num_tokens,▁▂▃▃▄▅▆▇▇█

0,1
total_flos,423970885456128.0
train/epoch,0.2
train/global_step,50.0
train/grad_norm,0.57369
train/learning_rate,0.0
train/loss,1.7387
train/mean_token_accuracy,0.6273
train/num_tokens,100819.0
train_loss,2.26482
train_runtime,76.7569


[34m[1mwandb[0m: Agent Starting Run: qkxsov2t with config:
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	lr: 0.0005
[34m[1mwandb[0m: 	rank: 16


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
5,3.1082
10,2.6342
15,2.4086
20,2.2406
25,2.2537
30,2.0045
35,2.08
40,2.1149
45,2.0648
50,1.7387


0,1
train/epoch,▁▂▃▃▄▅▆▆▇██
train/global_step,▁▂▃▃▄▅▆▆▇██
train/grad_norm,█▂▃▂▂▂▂▁▁▂
train/learning_rate,▅█▇▆▅▅▄▃▂▁
train/loss,█▆▄▄▄▂▃▃▃▁
train/mean_token_accuracy,▁▃▃▄▅▆▆▆▆█
train/num_tokens,▁▂▃▃▄▅▆▇▇█

0,1
total_flos,423970885456128.0
train/epoch,0.2
train/global_step,50.0
train/grad_norm,0.57369
train/learning_rate,0.0
train/loss,1.7387
train/mean_token_accuracy,0.6273
train/num_tokens,100819.0
train_loss,2.26482
train_runtime,76.7586


[34m[1mwandb[0m: Agent Starting Run: m3q0rdsx with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	lr: 0.0005
[34m[1mwandb[0m: 	rank: 16


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
5,3.1082
10,2.6342
15,2.4086
20,2.2406
25,2.2537
30,2.0045
35,2.08
40,2.1149
45,2.0648
50,1.7387


0,1
train/epoch,▁▂▃▃▄▅▆▆▇██
train/global_step,▁▂▃▃▄▅▆▆▇██
train/grad_norm,█▂▃▂▂▂▂▁▁▂
train/learning_rate,▅█▇▆▅▅▄▃▂▁
train/loss,█▆▄▄▄▂▃▃▃▁
train/mean_token_accuracy,▁▃▃▄▅▆▆▆▆█
train/num_tokens,▁▂▃▃▄▅▆▇▇█

0,1
total_flos,423970885456128.0
train/epoch,0.2
train/global_step,50.0
train/grad_norm,0.57369
train/learning_rate,0.0
train/loss,1.7387
train/mean_token_accuracy,0.6273
train/num_tokens,100819.0
train_loss,2.26482
train_runtime,76.7388


[34m[1mwandb[0m: Agent Starting Run: 7oi53r70 with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	rank: 16


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
5,3.0626
10,2.5429
15,2.2978
20,2.1048
25,2.1167
30,1.908
35,2.0072
40,2.0601
45,2.0188
50,1.6525


0,1
train/epoch,▁▂▃▃▄▅▆▆▇██
train/global_step,▁▂▃▃▄▅▆▆▇██
train/grad_norm,█▄▃▃▂▂▁▁▁▂
train/learning_rate,▅█▇▆▅▅▄▃▂▁
train/loss,█▅▄▃▃▂▃▃▃▁
train/mean_token_accuracy,▁▃▄▅▅▆▆▆▆█
train/num_tokens,▁▂▃▃▄▅▆▇▇█

0,1
total_flos,423970885456128.0
train/epoch,0.2
train/global_step,50.0
train/grad_norm,0.50783
train/learning_rate,0.0
train/loss,1.6525
train/mean_token_accuracy,0.63846
train/num_tokens,100819.0
train_loss,2.17715
train_runtime,76.7985


[34m[1mwandb[0m: Agent Starting Run: l8k7ntdi with config:
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	lr: 0.0005
[34m[1mwandb[0m: 	rank: 4


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
5,3.1138
10,2.6468
15,2.3995
20,2.2258
25,2.2256
30,1.9767
35,2.0548
40,2.102
45,2.0543
50,1.714


0,1
train/epoch,▁▂▃▃▄▅▆▆▇██
train/global_step,▁▂▃▃▄▅▆▆▇██
train/grad_norm,█▂▄▂▂▃▂▁▁▂
train/learning_rate,▅█▇▆▅▅▄▃▂▁
train/loss,█▆▄▄▄▂▃▃▃▁
train/mean_token_accuracy,▁▃▄▅▅▆▆▆▆█
train/num_tokens,▁▂▃▃▄▅▆▇▇█

0,1
total_flos,422618046107904.0
train/epoch,0.2
train/global_step,50.0
train/grad_norm,1.1747
train/learning_rate,0.0
train/loss,1.714
train/mean_token_accuracy,0.62787
train/num_tokens,100819.0
train_loss,2.25132
train_runtime,77.1437
