# Finetuning

In [None]:
"""
CUAD Long-Context Fine-tuning using Unsloth - Optimized for 65k tokens
Based on SQuAD-style formatting for generative QA tasks
"""


'\nCUAD Long-Context Fine-tuning using Unsloth - Optimized for 65k tokens\nBased on SQuAD-style formatting for generative QA tasks\n'

In [None]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Colab-specific installation - optimized versions
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install transformers==4.51.3
    !pip install --no-deps unsloth

In [None]:
# # Upgrade datasets for better long-context handling
# !pip install --upgrade datasets fsspec

In [None]:
# Import required libraries
from unsloth import FastLanguageModel, UnslothTrainer, UnslothTrainingArguments
from unsloth.chat_templates import train_on_responses_only
import torch
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from unsloth import is_bfloat16_supported
import time
from datetime import datetime
import gc
import json
import numpy as np

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
print("="*80)
print("CUAD LONG-CONTEXT FINE-TUNING - LLAMA 3.2 3B")
print("Optimized for up to 65k token contexts")
print("="*80)
print(f"Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Model configuration optimized for A100 40GB with long contexts
max_seq_length = 65536  # Full 65k context length
dtype = None  # Auto detection - will use bfloat16 on A100
load_in_4bit = True

print("\n1. Loading Llama 3.2 3B model with long-context support...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
tokenizer.model_max_length = max_seq_length
tokenizer.init_kwargs["model_max_length"] = max_seq_length

print("2. Adding LoRA adapters - optimized for long-context QA...")
# LoRA configuration following Unsloth best practices
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Balanced rank - good for QA tasks # increase for increasing tuning of more parameters
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                   "gate_proj", "up_proj", "down_proj",],
                  #  "embed_tokens", "lm_head"],  # Include embedding layers for better adaptation
    lora_alpha=32,  # 2x the rank as per best practices
    lora_dropout=0,  # Optimized setting per Unsloth
    bias="none",  # Optimized setting per Unsloth
    use_gradient_checkpointing="unsloth",  # Unsloth's memory-efficient version
    random_state=3407,
    use_rslora=True,  # Rank stabilized LoRA for better training
    loftq_config=None,
)

# Explicitly set tokenizer parameters for long context
tokenizer.model_max_length = max_seq_length
tokenizer.pad_token = tokenizer.eos_token
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

print(f"Tokenizer max length set to: {tokenizer.model_max_length}")
print(f"Model max position embeddings: {model.config.max_position_embeddings}")

CUAD LONG-CONTEXT FINE-TUNING - LLAMA 3.2 3B
Optimized for up to 65k token contexts
Started at: 2025-06-07 12:05:39

1. Loading Llama 3.2 3B model with long-context support...
==((====))==  Unsloth 2025.6.1: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
2. Adding LoRA adapters - optimized for long-context QA...


Unsloth 2025.6.1 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Tokenizer max length set to: 65536
Model max position embeddings: 131072


In [None]:
def preprocess_cuad_function(examples):
    """
    Convert CUAD dataset to SQuAD-style format for generative QA.
    Handles empty answers properly for legal document analysis.
    """
    contexts = examples["context"]
    questions = examples["question"]
    answers = []

    # Process answers - handle the SQuAD-style answer format and empty answers
    for ans in examples["answers"]:
        if ans.get("text") and len(ans["text"]) > 0:
            # Take the first non-empty answer, strip whitespace
            answer_text = next((a.strip() for a in ans["text"] if a.strip()), "")
            # If still empty after stripping, use "Not found"
            answers.append(answer_text if answer_text else "Not found")
        else:
            answers.append("Not found")  # Consistent format for empty answers

    # Create input texts - improved prompt structure
    inputs = [
        f"You are a legal document analyzer. Extract exact phrases from legal documents to answer questions. Only provide the exact text/phrases from the document that answer the question. Do not add explanations or commentary. If the information is not found, respond with 'Not found'.\n\nDocument: {ctx}\n\nQuestion: {q}\n\nAnswer (extract exact phrase from document):"
        for ctx, q in zip(contexts, questions)
    ]

    return {
        "input_text": inputs,
        "target_text": answers,
    }

def formatting_prompts_func(example):
    """
    Format examples for training following Unsloth tutorial pattern.
    This creates the proper training format with end-of-text tokens.
    """
    # Handle both single examples and batched examples
    if isinstance(example["input_text"], str):
        # Single example
        input_text = example["input_text"]
        target_text = example["target_text"]
        # Format: Input + space + target + end-of-text token
        formatted_text = f"{input_text} {target_text}<|eot_id|>"
        return [formatted_text]
    else:
        # Batched examples - return list of formatted strings
        formatted_texts = []
        for inp, tgt in zip(example["input_text"], example["target_text"]):
            formatted_text = f"{inp} {tgt}<|eot_id|>"
            formatted_texts.append(formatted_text)
        return formatted_texts

print("\n3. Loading and preprocessing CUAD dataset...")
try:
    # Load the full CUAD dataset
    cuad_dataset = load_dataset("theatticusproject/cuad-qa", trust_remote_code=True)
    print(f"Loaded {len(cuad_dataset['train'])} training examples")

    # For initial testing, you might want to use a subset
    # Uncomment the next two lines for faster experimentation
    cuad_dataset["train"] = cuad_dataset["train"].shuffle(seed=42).select(range(10000))
    print(f"Using subset of {len(cuad_dataset['train'])} examples for faster training")

except Exception as e:
    print(f"Error loading CUAD dataset: {e}")
    raise


3. Loading and preprocessing CUAD dataset...
Loaded 22450 training examples
Using subset of 10000 examples for faster training


In [None]:
print("4. Preprocessing dataset to SQuAD format...")
try:
    # Apply preprocessing to convert to input_text/target_text format
    cuad_processed = cuad_dataset.map(
        preprocess_cuad_function,
        batched=True,
        num_proc=4,  # Parallel processing
        remove_columns=cuad_dataset["train"].column_names,
        desc="Converting to SQuAD format"
    )

    print(f"Processed dataset size: {len(cuad_processed['train'])} examples")

    # Check for empty examples and filter them out
    def filter_empty_examples(example):
        return (len(example["input_text"].strip()) > 100 and  # Minimum context length
                len(example["target_text"].strip()) > 0)      # Must have some answer (including "Not found")

    cuad_processed["train"] = cuad_processed["train"].filter(filter_empty_examples)
    print(f"After filtering: {len(cuad_processed['train'])} examples")
    train_dataset = cuad_processed["train"]  # Store reference for later use

except Exception as e:
    print(f"Error preprocessing dataset: {e}")
    raise

# Show sample of the processed data
if len(cuad_processed["train"]) > 0:
    print("\nSample processed example:")
    print("-" * 80)
    sample = cuad_processed["train"][0]
    print("Input text (first 500 chars):")
    print(sample["input_text"][:500] + "...")
    print(f"\nTarget text: {sample['target_text']}")
    print("\nFormatted example (first 600 chars):")
    formatted_sample = formatting_prompts_func(sample)[0]
    print(formatted_sample[:600] + "...")
    print("-" * 80)
    print(formatted_sample[-400:])

4. Preprocessing dataset to SQuAD format...
Processed dataset size: 10000 examples
After filtering: 10000 examples

Sample processed example:
--------------------------------------------------------------------------------
Input text (first 500 chars):
You are a legal document analyzer. Extract exact phrases from legal documents to answer questions. Only provide the exact text/phrases from the document that answer the question. Do not add explanations or commentary. If the information is not found, respond with 'Not found'.

Document: Exhibit 10.1   JACKSONVILLE JAGUARS SPONSORSHIP AGREEMENT   This Sponsorship Agreement (this "Agreement") is entered into as of November 27, 2017 (the "Execution Date") by and between Jacksonville Jaguars, LLC, a...

Target text: Not found

Formatted example (first 600 chars):
You are a legal document analyzer. Extract exact phrases from legal documents to answer questions. Only provide the exact text/phrases from the document that answer the question. Do

In [None]:
# Forcefully set the tokenizer's max length. This is still good practice.
tokenizer.model_max_length = max_seq_length
tokenizer.init_kwargs["model_max_length"] = max_seq_length

# This new, single function will handle EVERYTHING: formatting, tokenizing, and masking.
def prepare_and_tokenize_dataset(examples):
    """
    This function takes a batch of examples, formats them into the full prompt-answer
    string, tokenizes them to the full max_seq_length, and creates the masked labels.
    """
    # This is the boundary marker for masking
    answer_prefix = "Answer (extract exact phrase from document):"

    # Format the full text string (prompt + answer + EOS)
    # The `formatting_prompts_func` returns a list, so we access the first element.
    texts = formatting_prompts_func(examples) # This will return a list of formatted strings

    # Tokenize the full texts to the max sequence length
    model_inputs = tokenizer(
        texts,
        truncation=True,
        padding=False, # This is essential!
        max_length=max_seq_length,
    )

    # Create the labels and apply the mask.
    # We will mask everything up to and including the answer_prefix.
    labels = []
    for i in range(len(model_inputs["input_ids"])):
        # Find the start of the answer
        # We search for the tokenized representation of our answer prefix
        # Note: We search in the input_ids of this specific example.

        # We need to find the end of the prompt to mask it.
        # Let's find the start of the answer by finding the answer_prefix tokens
        # We need to tokenize the prefix itself to find its token IDs
        # Note: add_special_tokens=False is important here!
        prefix_tokens = tokenizer.encode(answer_prefix, add_special_tokens=False)

        # Find the sequence of prefix_tokens in the input_ids
        input_ids = model_inputs["input_ids"][i]
        prompt_end_idx = -1
        for k in range(len(input_ids) - len(prefix_tokens)):
            if input_ids[k:k+len(prefix_tokens)] == prefix_tokens:
                prompt_end_idx = k + len(prefix_tokens)
                break

        # Create a copy of the input_ids to use as labels
        label = list(input_ids)

        if prompt_end_idx != -1:
            # Mask everything up to the end of the prefix
            # The +1 is to account for the space token after the prefix
            label[:prompt_end_idx ] = [-100] * (prompt_end_idx )
        else:
            # If for some reason the prefix isn't found, mask the whole sequence
            # This prevents accidental training on prompts if something goes wrong.
            label[:] = [-100] * len(label)

        labels.append(label)

    # Add the labels to our model inputs
    model_inputs["labels"] = labels

    return model_inputs

# --- Apply this new function to the dataset ---
print("Preparing and tokenizing dataset manually to full context length...")
cuad_tokenized = cuad_processed.map(
    prepare_and_tokenize_dataset,
    batched=True,
    # This function is complex, so we run it on a single process.
    num_proc=4,
    remove_columns=cuad_processed["train"].column_names,
    desc="Tokenizing and Masking all examples"
)

# --- Verification Step ---
print("\nVerifying tokenized output...")
print("Input IDs shape of first example:", len(cuad_tokenized['train'][0]['input_ids']))
print("Labels shape of first example:", len(cuad_tokenized['train'][0]['labels']))

Preparing and tokenizing dataset manually to full context length...

Verifying tokenized output...
Input IDs shape of first example: 3654
Labels shape of first example: 3654


In [None]:
print("\n5. Setting up training configuration...")

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments optimized for long-context and A100 40GB
trainer = UnslothTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=cuad_tokenized["train"],
    # dataset_text_field=None,  # This will be transformed by formatting_func
    # formatting_func=formatting_prompts_func,  # Key function for proper formatting
    max_seq_length=max_seq_length,
    data_collator=data_collator,
    args=UnslothTrainingArguments(
        # Batch size configuration for A100 40GB with 65k context
        per_device_train_batch_size=2,      # Start with 1 due to long sequences
        gradient_accumulation_steps=4,      # Effective batch size = 8

        # Learning rate configuration
        learning_rate=2e-4,                 # Standard Unsloth learning rate
        embedding_learning_rate=1e-5,       # Lower LR for embeddings (continual pretraining)

        # Training schedule
        num_train_epochs=1,                 # Start with 1 epoch for long contexts
        warmup_ratio=0.03,                  # Smaller warmup for long training

        # Optimization settings
        optim="adamw_8bit",                 # Memory-efficient optimizer
        weight_decay=0.01,
        lr_scheduler_type="cosine",         # Cosine scheduler as in reference
        max_grad_norm=1.0,

        # Precision settings for A100
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),

        # Logging and saving
        logging_steps=5,
        save_steps=100,
        save_total_limit=3,

        # Output configuration
        output_dir="./cuad_long_context_outputs",
        report_to="none",

        # Memory optimization
        dataloader_num_workers=2,
        remove_unused_columns=True,        # Keep for custom formatting

        # Reproducibility
        seed=3407,

    ),
)

# Mask everything up through the answer prompt, un-mask only the answer tokens
# trainer = train_on_responses_only(
#     trainer,
#     instruction_part="Answer (extract exact phrase from document):",
#     response_part   ="Answer (extract exact phrase from document):"
# )

# Memory check before training
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"\nGPU: {gpu_stats.name}")
print(f"Max memory: {max_memory} GB")
print(f"Memory reserved before training: {start_gpu_memory} GB")
print(f"Available memory: {max_memory - start_gpu_memory} GB")


5. Setting up training configuration...

GPU: NVIDIA A100-SXM4-40GB
Max memory: 39.557 GB
Memory reserved before training: 3.441 GB
Available memory: 36.116 GB


In [None]:
# grab a single batch
batch = next(iter(trainer.get_train_dataloader()))
print("Input IDs shape:", batch["input_ids"].shape)
# Expect shape: [batch_size, max-seq-length in that batch]

# Also inspect one example
feature = trainer.train_dataset[0]
length_after_tokenization = len(feature["input_ids"])
print("Tokenized length of sample 0:", length_after_tokenization)

Input IDs shape: torch.Size([2, 23914])
Tokenized length of sample 0: 3654


In [None]:
# Pick a sample index to inspect (0 for the first example)
idx = 0

# Grab the labels and input_ids for that example
feature = trainer.train_dataset[idx]
labels   = feature["labels"]      # –100 for masked tokens, real IDs for un-masked
input_ids= feature["input_ids"]

# Which token IDs survive the masking?
unmasked_ids = [l for l in labels if l != -100]

# Decode them back to text
decoded_answer = tokenizer.decode(unmasked_ids)

print("=== UNMASKED TOKENS (answer) ===")
print(decoded_answer)

# (Optional) see the full prompt for context
full_prompt = tokenizer.decode(input_ids)
print("\n=== FULL PROMPT ===")
print(full_prompt)


=== UNMASKED TOKENS (answer) ===
 Not found<|eot_id|>

=== FULL PROMPT ===
<|begin_of_text|>You are a legal document analyzer. Extract exact phrases from legal documents to answer questions. Only provide the exact text/phrases from the document that answer the question. Do not add explanations or commentary. If the information is not found, respond with 'Not found'.

Document: Exhibit 10.1   JACKSONVILLE JAGUARS SPONSORSHIP AGREEMENT   This Sponsorship Agreement (this "Agreement") is entered into as of November 27, 2017 (the "Execution Date") by and between Jacksonville Jaguars, LLC, a Delaware limited liability company ("Club"), and The ARC Group, Inc., a Florida corporation (owner and operator of Dick's Wings and Grill) ("Sponsor"). This Agreement consists of this Sponsorship Agreement and Exhibits A and B hereto, each of which is incorporated into and forms a part of this Agreement by this reference.   RECITALS   A. Club owns and operates the National Football League ("NFL") team kn

In [None]:
print("\nVerifying the masking by decoding the first training example's labels...")

# 1. Grab the first processed example from the trainer's dataset
first_example_processed = trainer.train_dataset[1]

# 2. Get the list of labels. It will be a mix of token IDs and -100s.
labels = first_example_processed["labels"]

# 3. Filter out all the masked tokens (the -100s).
unmasked_label_ids = [label for label in labels if label != -100]

# 4. Decode the remaining token IDs back into a readable string.
decoded_answer = tokenizer.decode(unmasked_label_ids)

print("\n------------------------------------------------------------")
print("Decoded Unmasked Labels (This should ONLY be the answer):")
print(f"'{decoded_answer}'")
print("------------------------------------------------------------\n")

# You can also check the number of unmasked tokens
print(f"Total tokens in sequence: {len(labels)}")
print(f"Number of unmasked (answer) tokens: {len(unmasked_label_ids)}")

# ===================================================================================
# END OF VERIFICATION CODE
# ===================================================================================



Verifying the masking by decoding the first training example's labels...

------------------------------------------------------------
Decoded Unmasked Labels (This should ONLY be the answer):
' Boyd<|eot_id|>'
------------------------------------------------------------

Total tokens in sequence: 2038
Number of unmasked (answer) tokens: 2


In [None]:
# Test the formatting function with a sample
print("\n6. Testing formatting function...")
test_sample = cuad_processed["train"][0]
formatted_test = formatting_prompts_func(test_sample)
print("Formatted training example (first 800 characters):")
print(formatted_test[:800] + "..." if len(formatted_test) > 800 else formatted_test)


6. Testing formatting function...
Formatted training example (first 800 characters):
['You are a legal document analyzer. Extract exact phrases from legal documents to answer questions. Only provide the exact text/phrases from the document that answer the question. Do not add explanations or commentary. If the information is not found, respond with \'Not found\'.\n\nDocument: Exhibit 10.1   JACKSONVILLE JAGUARS SPONSORSHIP AGREEMENT   This Sponsorship Agreement (this "Agreement") is entered into as of November 27, 2017 (the "Execution Date") by and between Jacksonville Jaguars, LLC, a Delaware limited liability company ("Club"), and The ARC Group, Inc., a Florida corporation (owner and operator of Dick\'s Wings and Grill) ("Sponsor"). This Agreement consists of this Sponsorship Agreement and Exhibits A and B hereto, each of which is incorporated into and forms a part of this Agreement by this reference.   RECITALS   A. Club owns and operates the National Football League ("NFL") team k

In [None]:
print("\n7. Starting training...")
print("=" * 60)
start_time = time.time()

try:
    # Clear cache before training
    torch.cuda.empty_cache()
    gc.collect()

    # Start training
    trainer_stats = trainer.train()

    training_time = time.time() - start_time
    final_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)

    print("=" * 60)
    print("TRAINING COMPLETED SUCCESSFULLY!")
    print(f"Training time: {training_time/60:.2f} minutes")
    print(f"Peak GPU memory: {final_gpu_memory} GB")
    print(f"Memory increase: {final_gpu_memory - start_gpu_memory} GB")

    if trainer.state.log_history:
        final_loss = trainer.state.log_history[-1].get('train_loss', 'N/A')
        print(f"Final training loss: {final_loss}")

    training_successful = True

except Exception as e:
    print(f"Training failed: {e}")
    print("This might be due to memory constraints with very long sequences.")
    print("Consider reducing per_device_train_batch_size to 1 or max_seq_length.")
    training_successful = False
    raise


7. Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,000 | Num Epochs = 1 | Total steps = 1,250
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856/3,000,000,000 (0.81% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
5,4.0458
10,4.9779
15,3.401
20,3.8706
25,1.9003
30,2.2858
35,2.4145
40,1.8785
45,1.4662
50,1.4917


TRAINING COMPLETED SUCCESSFULLY!
Training time: 778.85 minutes
Peak GPU memory: 19.061 GB
Memory increase: 15.620000000000001 GB
Final training loss: 0.47625361874103544


In [None]:
# Post-training code
if training_successful:
    end_time = time.time()
    training_time = end_time - start_time

    # Final memory stats - FOLLOWING UNSLOTH TUTORIAL FORMAT
    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory / max_memory * 100, 3)
    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

    print("=" * 40)
    print("TRAINING COMPLETED SUCCESSFULLY!")
    print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
    print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
    print(f"Peak reserved memory = {used_memory} GB.")
    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

    print("\n8. Saving fine-tuned model...")

    # Save LoRA adapters - FOLLOWING UNSLOTH TUTORIAL
    model_save_path = "./cuad_finetuned_llama3_2_3b"
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

    # Save training info
    training_info = {
        "model_name": "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
        "dataset": "cuad",
        "training_examples": len(train_dataset),
        "max_seq_length": max_seq_length,
        "training_steps": trainer_stats.metrics.get('train_steps', 'unknown'),
        "training_time_minutes": training_time/60,
        "final_loss": trainer_stats.metrics.get('train_loss', 'unknown'),
        "saved_at": datetime.now().isoformat(),
        "lora_config": {
            "r": 16,  # Updated to match the actual config from first part
            "lora_alpha": 32,  # Updated to match the actual config from first part
            "lora_dropout": 0
        }
    }

    with open(f"{model_save_path}/training_info.json", "w") as f:
        json.dump(training_info, f, indent=2)

    print(f"Model saved to: {model_save_path}")
    print(f"Training info saved to: {model_save_path}/training_info.json")

    tokenizer.pad_token_id = tokenizer.eos_token_id  # or tokenizer.unk_token_id

    print("\n9. Quick inference test...")
    FastLanguageModel.for_inference(model)

    test_context = """
    This Software License Agreement ("Agreement") is entered into on January 1, 2024,
    between Company A and Company B. The term of this Agreement shall be for a period
    of three (3) years from the Effective Date, unless terminated earlier in accordance
    with the terms herein.
    """

    test_question = "What is the duration of this agreement?"

    # Use the same prompt format as in preprocessing
    test_prompt = f"You are a legal document analyzer. Extract exact phrases from legal documents to answer questions. Only provide the exact text/phrases from the document that answer the question. Do not add explanations or commentary. If the information is not found, respond with 'Not found'.\n\nDocument: {test_context}\n\nQuestion: {test_question}\n\nAnswer (extract exact phrase from document):"

    max_new_tokens = 64
    tokenized = tokenizer(
        test_prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_seq_length - max_new_tokens
    ).to("cuda")

    print(f"Test Question: {test_question}")
    print("Model Response:")

    with torch.no_grad():
        outputs = model.generate(
            **tokenized,
            max_new_tokens=max_new_tokens,
            use_cache=True,
            temperature=1.5,
            min_p=0.1,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the generated part (skip the input prompt)
    generated_text = tokenizer.decode(outputs[0][tokenized['input_ids'].shape[1]:], skip_special_tokens=True)
    print(generated_text)

    print("\n" + "="*80)
    print("FINE-TUNING COMPLETED SUCCESSFULLY!")
    print("="*80)
    print(f"Model saved at: {model_save_path}")
    print("You can now use the evaluation script to test performance.")
    print("="*80)

else:
    print("Training failed. Please check the error messages above.")

print(f"\nScript completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*80)

TRAINING COMPLETED SUCCESSFULLY!
46728.6389 seconds used for training.
778.81 minutes used for training.
Peak reserved memory = 19.061 GB.
Peak reserved memory for training = 15.62 GB.
Peak reserved memory % of max memory = 48.186 %.
Peak reserved memory for training % of max memory = 39.487 %.

8. Saving fine-tuned model...
Model saved to: ./cuad_finetuned_llama3_2_3b
Training info saved to: ./cuad_finetuned_llama3_2_3b/training_info.json

9. Quick inference test...
Test Question: What is the duration of this agreement?
Model Response:
 The term of this Agreement shall be for a period 
    of three (3) years from the Effective Date, unless terminated earlier in accordance 
    with the terms herein.

FINE-TUNING COMPLETED SUCCESSFULLY!
Model saved at: ./cuad_finetuned_llama3_2_3b
You can now use the evaluation script to test performance.

Script completed at: 2025-06-08 01:05:56


In [None]:
!zip -r llama_finetuned.zip /content/cuad_finetuned_llama3_2_3b

  adding: content/cuad_finetuned_llama3_2_3b/ (stored 0%)
  adding: content/cuad_finetuned_llama3_2_3b/tokenizer.json (deflated 85%)
  adding: content/cuad_finetuned_llama3_2_3b/adapter_config.json (deflated 55%)
  adding: content/cuad_finetuned_llama3_2_3b/README.md (deflated 66%)
  adding: content/cuad_finetuned_llama3_2_3b/training_info.json (deflated 34%)
  adding: content/cuad_finetuned_llama3_2_3b/special_tokens_map.json (deflated 63%)
  adding: content/cuad_finetuned_llama3_2_3b/tokenizer_config.json (deflated 94%)
  adding: content/cuad_finetuned_llama3_2_3b/adapter_model.safetensors (deflated 7%)


In [None]:
from google.colab import files
files.download('llama_finetuned.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Evaluation

In [None]:
"""
CUAD-QA Evaluation Script for Llama 3.2 3B - ALIGNED WITH FINE-TUNING
Supports both original and fine-tuned models
Uses SQuAD-style evaluation metrics
Matches the exact prompt format and parameters used during fine-tuning
"""

import os
import re
import string
import json
import argparse
import torch
import time
from collections import Counter
from datasets import load_dataset
from tqdm import tqdm
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# ================== EVALUATION METRICS ==================

def normalize_answer(s):
    """Normalize answer for comparison (SQuAD-style)"""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def get_tokens(s):
    """Get normalized tokens from string"""
    if not s:
        return []
    return normalize_answer(s).split()

def exact_match_score(prediction, ground_truth):
    """Calculate exact match score"""
    return int(normalize_answer(prediction) == normalize_answer(ground_truth))

def f1_score(prediction, ground_truth):
    """Calculate F1 score between prediction and ground truth"""
    pred_tokens = get_tokens(prediction)
    truth_tokens = get_tokens(ground_truth)

    if len(pred_tokens) == 0 and len(truth_tokens) == 0:
        return 1.0
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return 0.0

    common_tokens = Counter(pred_tokens) & Counter(truth_tokens)
    num_same = sum(common_tokens.values())

    if num_same == 0:
        return 0.0

    precision = 1.0 * num_same / len(pred_tokens)
    recall = 1.0 * num_same / len(truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)

    return f1

def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Calculate max metric over all ground truths"""
    if not ground_truths:
        return 0.0
    scores = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores.append(score)
    return max(scores)

# ================== MODEL WRAPPER - ALIGNED WITH FINE-TUNING ==================

class CUADEvaluator:
    def __init__(self, model_path, max_seq_length=65536):
        self.model_path = model_path
        self.max_seq_length = max_seq_length
        self.model = None
        self.tokenizer = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def load_model(self):
        """Load model and tokenizer - ALIGNED WITH FINE-TUNING SCRIPT"""
        print(f"Loading model from: {self.model_path}")
        print(f"Max sequence length: {self.max_seq_length}")

        try:
            # Use same configuration as fine-tuning script
            self.model, self.tokenizer = FastLanguageModel.from_pretrained(
                model_name=self.model_path,
                max_seq_length=self.max_seq_length,
                dtype=None,  # Auto detection like in fine-tuning
                load_in_4bit=True,
                device_map="auto",
            )

            # Set pad_token_id like in fine-tuning script
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

            # Enable inference mode - FOLLOWING UNSLOTH TUTORIAL
            FastLanguageModel.for_inference(self.model)
            self.model.eval()

            print("✅ Model loaded successfully!")
            return True

        except Exception as e:
            print(f"❌ Error loading model: {e}")
            return False

    def clean_response(self, response):
        """Clean and extract answer from model response - IMPROVED"""
        if not response:
            return "Not found"

        # Remove common prefixes that models add
        prefixes_to_remove = [
            "Based on the provided contract",
            "Upon reviewing the contract",
            "The part of this contract",
            "The parts of this contract",
            "After analyzing the contract",
            "Upon analyzing the contract",
            "In this contract",
            "The contract",
            "Looking at",
            "According to",
            "From the contract",
            "The relevant",
            "Here is",
            "Here are",
            "The answer is",
            "Answer:",
            "Response:",
            "The exact phrase",
            "Extract exact phrase",
        ]

        cleaned = response.strip()

        # Remove common prefixes (case insensitive)
        for prefix in prefixes_to_remove:
            if cleaned.lower().startswith(prefix.lower()):
                cleaned = cleaned[len(prefix):].strip()
                # Remove common continuation patterns
                if cleaned.startswith(":"):
                    cleaned = cleaned[1:].strip()
                if cleaned.startswith(","):
                    cleaned = cleaned[1:].strip()
                break

        # Split by common separators and take the first substantive part
        separators = ['\n', '. ', ':', ' that should be reviewed', ' are:', ' is:']
        for sep in separators:
            if sep in cleaned:
                parts = cleaned.split(sep)
                if len(parts) > 1:
                    # Take the first non-empty meaningful part
                    for part in parts:
                        part = part.strip()
                        if part and len(part) > 3 and not part.lower().startswith(('the following', 'as follows')):
                            cleaned = part
                            break
                break

        # Final cleanup
        cleaned = cleaned.strip()

        # Remove quotes if they wrap the entire answer
        if cleaned.startswith('"') and cleaned.endswith('"'):
            cleaned = cleaned[1:-1].strip()
        if cleaned.startswith("'") and cleaned.endswith("'"):
            cleaned = cleaned[1:-1].strip()

        # If still too verbose or empty, return "Not found"
        if not cleaned or len(cleaned) > 200 or cleaned.lower().startswith(('there is no', 'no specific', 'i did not find', 'not found')):
            return "Not found"

        return cleaned

    def generate_answer(self, question, context, max_new_tokens=64):
        """Generate answer - EXACTLY MATCHING FINE-TUNING PROMPT FORMAT"""

        # Context truncation - same logic as fine-tuning
        max_context_length = self.max_seq_length - 1000
        if len(context) > max_context_length:
            # Keep beginning and end of context
            half_length = max_context_length // 2
            context = context[:half_length] + "\n[...TRUNCATED...]\n" + context[-half_length:]

        # EXACT SAME PROMPT FORMAT AS FINE-TUNING SCRIPT
        prompt = f"You are a legal document analyzer. Extract exact phrases from legal documents to answer questions. Only provide the exact text/phrases from the document that answer the question. Do not add explanations or commentary. If the information is not found, respond with 'Not found'.\n\nDocument: {context}\n\nQuestion: {question}\n\nAnswer (extract exact phrase from document):"

        try:
            # Tokenize with same approach as fine-tuning script
            tokenized = self.tokenizer(
                prompt,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=self.max_seq_length - max_new_tokens
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    **tokenized,
                    max_new_tokens=max_new_tokens,
                    use_cache=True,
                    temperature=1.5,  # Same as fine-tuning script inference test
                    min_p=0.1,       # Same as fine-tuning script inference test
                    do_sample=True,  # Same as fine-tuning script
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                )

            # Decode only the generated part (skip the input prompt)
            generated_text = self.tokenizer.decode(
                outputs[0][tokenized['input_ids'].shape[1]:],
                skip_special_tokens=True
            ).strip()

            # Apply post-processing
            cleaned_response = self.clean_response(generated_text)

            return cleaned_response

        except Exception as e:
            print(f"Error during generation: {e}")
            return "Error generating response"

# ================== EVALUATION FUNCTIONS ==================

def evaluate_dataset(evaluator, dataset, max_samples=None, save_predictions=True, output_dir="./"):
    """Evaluate model on dataset"""

    if max_samples is not None:
        dataset = dataset.select(range(min(max_samples, len(dataset))))

    print(f"Evaluating on {len(dataset)} samples...")

    predictions = {}
    detailed_results = []
    total_em = 0
    total_f1 = 0

    start_time = time.time()

    for i, example in enumerate(tqdm(dataset, desc="Evaluating")):
        try:
            question_id = example.get('id', f"question_{i}")
            question = example['question']
            context = example['context']

            # Handle answers in CUAD format - same as fine-tuning preprocessing
            if example['answers']['text'] and len(example['answers']['text']) > 0:
                # Take the first non-empty answer, strip whitespace
                ground_truths = [next((a.strip() for a in example['answers']['text'] if a.strip()), "Not found")]
                # If still empty after stripping, use "Not found"
                if not ground_truths[0]:
                    ground_truths = ["Not found"]
            else:
                ground_truths = ["Not found"]  # Consistent with fine-tuning preprocessing

            # Generate prediction
            prediction = evaluator.generate_answer(question, context)
            predictions[question_id] = prediction

            # Calculate metrics
            em = metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)
            f1 = metric_max_over_ground_truths(f1_score, prediction, ground_truths)

            total_em += em
            total_f1 += f1

            # Store detailed result
            detailed_results.append({
                'id': question_id,
                'question': question[:200] + "..." if len(question) > 200 else question,
                'context_length': len(context),
                'ground_truths': ground_truths,
                'prediction': prediction,
                'exact_match': em,
                'f1_score': f1
            })

            if (i + 1) % 10 == 0:
                current_em = (total_em / (i + 1)) * 100
                current_f1 = (total_f1 / (i + 1)) * 100
                print(f"Progress: {i+1}/{len(dataset)} | EM: {current_em:.2f}% | F1: {current_f1:.2f}%")

        except Exception as e:
            print(f"Error processing example {i}: {e}")
            continue

    end_time = time.time()
    evaluation_time = end_time - start_time

    # Calculate final metrics
    num_evaluated = len(detailed_results)
    if num_evaluated == 0:
        print("❌ No examples were successfully evaluated!")
        return None

    final_em = (total_em / num_evaluated) * 100
    final_f1 = (total_f1 / num_evaluated) * 100

    # Prepare results
    results = {
        'model_path': evaluator.model_path,
        'dataset_size': num_evaluated,
        'exact_match': final_em,
        'f1_score': final_f1,
        'evaluation_time_seconds': evaluation_time,
        'samples_per_second': num_evaluated / evaluation_time,
        'timestamp': datetime.now().isoformat(),
        'detailed_results': detailed_results,
    }

    # Print summary
    print("\n" + "="*80)
    print("EVALUATION RESULTS")
    print("="*80)
    print(f"Model: {evaluator.model_path}")
    print(f"Samples Evaluated: {num_evaluated}")
    print(f"Exact Match: {final_em:.2f}%")
    print(f"F1 Score: {final_f1:.2f}%")
    print(f"Evaluation Time: {evaluation_time/60:.2f} minutes")
    print(f"Speed: {num_evaluated/evaluation_time:.2f} samples/second")
    print("="*80)

    # Save results
    if save_predictions:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        model_name = os.path.basename(evaluator.model_path).replace("/", "_")

        # Save predictions
        pred_file = os.path.join(output_dir, f"predictions_{model_name}_{timestamp}.json")
        with open(pred_file, 'w') as f:
            json.dump(predictions, f, indent=2)

        # Save detailed results
        results_file = os.path.join(output_dir, f"results_{model_name}_{timestamp}.json")
        with open(results_file, 'w') as f:
            json.dump(results, f, indent=2)

        print(f"📁 Predictions saved to: {pred_file}")
        print(f"📁 Detailed results saved to: {results_file}")

    return results

# ================== MAIN FUNCTION ==================

def main():
    parser = argparse.ArgumentParser(description="Evaluate Llama 3.2 3B on CUAD dataset")
    parser.add_argument("--model_path",
                       default="unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
                       help="HuggingFace model repo or local path to fine-tuned model")
    parser.add_argument("--finetuned_model_path",
                       default="./cuad_finetuned_llama3_2_3b",
                       help="Path to fine-tuned model for comparison")
    parser.add_argument("--split",
                       default="test",
                       choices=["train", "test"],
                       help="Dataset split to evaluate on")
    parser.add_argument("--max_samples",
                       type=int,
                       default=4182,
                       help="Maximum number of samples to evaluate")
    parser.add_argument("--evaluate_original",
                       action="store_true",
                       default=True,
                       help="Evaluate original model")
    parser.add_argument("--evaluate_finetuned",
                       action="store_true",
                       default=True,
                       help="Evaluate fine-tuned model")
    parser.add_argument("--output_dir",
                       default="./evaluation_results",
                       help="Directory to save results")
    parser.add_argument("--max_seq_length",
                       type=int,
                       default=65536,
                       help="Maximum sequence length - should match fine-tuning")

    args, _ = parser.parse_known_args()

    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)

    print("🚀 CUAD EVALUATION SCRIPT - ALIGNED WITH FINE-TUNING")
    print("="*60)
    print(f"Split: {args.split}")
    print(f"Max samples: {args.max_samples}")
    print(f"Max sequence length: {args.max_seq_length}")
    print(f"Output directory: {args.output_dir}")

    # Load dataset
    print(f"\n📂 Loading CUAD {args.split} dataset...")
    try:
        dataset = load_dataset("theatticusproject/cuad-qa", split=args.split, trust_remote_code=True)
        print(f"✅ Loaded {len(dataset)} examples")
    except Exception as e:
        print(f"❌ Error loading dataset: {e}")
        return

    results_summary = []

    # Evaluate original model
    if args.evaluate_original:
        print(f"\n🔄 EVALUATING ORIGINAL MODEL")
        print("="*50)

        evaluator_original = CUADEvaluator(
            model_path=args.model_path,
            max_seq_length=args.max_seq_length,
        )

        if evaluator_original.load_model():
            original_results = evaluate_dataset(
                evaluator_original,
                dataset,
                max_samples=args.max_samples,
                output_dir=args.output_dir
            )
            if original_results:
                results_summary.append(('Original', original_results))

        # Clear memory
        del evaluator_original
        torch.cuda.empty_cache()

    # Evaluate fine-tuned model
    if args.evaluate_finetuned:
        print(f"\n🔄 EVALUATING FINE-TUNED MODEL")
        print("="*50)

        if not os.path.exists(args.finetuned_model_path):
            print(f"⚠️  Fine-tuned model not found at: {args.finetuned_model_path}")
            print("Skipping fine-tuned evaluation...")
        else:
            evaluator_finetuned = CUADEvaluator(
                model_path=args.finetuned_model_path,
                max_seq_length=args.max_seq_length,
            )

            if evaluator_finetuned.load_model():
                finetuned_results = evaluate_dataset(
                    evaluator_finetuned,
                    dataset,
                    max_samples=args.max_samples,
                    output_dir=args.output_dir
                )
                if finetuned_results:
                    results_summary.append(('Fine-tuned', finetuned_results))

            # Clear memory
            del evaluator_finetuned
            torch.cuda.empty_cache()

    # Compare results
    if len(results_summary) >= 2:
        print(f"\n📊 COMPARISON SUMMARY")
        print("="*80)
        print(f"{'Model':<15} {'Exact Match':<15} {'F1 Score':<15} {'Samples':<10} {'Time (min)':<12}")
        print("-" * 80)

        for model_name, results in results_summary:
            print(f"{model_name:<15} {results['exact_match']:<15.2f} "
                  f"{results['f1_score']:<15.2f} {results['dataset_size']:<10} "
                  f"{results['evaluation_time_seconds']/60:<12.2f}")

        # Calculate improvement
        if len(results_summary) == 2:
            original_em = results_summary[0][1]['exact_match']
            original_f1 = results_summary[0][1]['f1_score']
            finetuned_em = results_summary[1][1]['exact_match']
            finetuned_f1 = results_summary[1][1]['f1_score']

            em_improvement = finetuned_em - original_em
            f1_improvement = finetuned_f1 - original_f1

            print(f"\n🎯 IMPROVEMENT:")
            print(f"Exact Match: {em_improvement:+.2f} percentage points")
            print(f"F1 Score: {f1_improvement:+.2f} percentage points")

        print("="*80)

    print(f"\n✅ Evaluation completed! Results saved in: {args.output_dir}")

if __name__ == "__main__":
    main()

🚀 CUAD EVALUATION SCRIPT - ALIGNED WITH FINE-TUNING
Split: test
Max samples: 4182
Max sequence length: 65536
Output directory: ./evaluation_results

📂 Loading CUAD test dataset...
✅ Loaded 4182 examples

🔄 EVALUATING ORIGINAL MODEL
Loading model from: unsloth/Llama-3.2-3B-Instruct-bnb-4bit
Max sequence length: 65536
==((====))==  Unsloth 2025.6.1: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
✅ Model loaded successfully!
Evaluating on 4182 samples...


Evaluating:   0%|          | 10/4182 [00:19<1:40:54,  1.45s/it]

Progress: 10/4182 | EM: 40.00% | F1: 60.49%


Evaluating:   0%|          | 20/4182 [00:36<2:00:02,  1.73s/it]

Progress: 20/4182 | EM: 40.00% | F1: 54.25%


Evaluating:   1%|          | 30/4182 [00:54<1:18:45,  1.14s/it]

Progress: 30/4182 | EM: 50.00% | F1: 60.61%


Evaluating:   1%|          | 40/4182 [01:15<2:41:40,  2.34s/it]

Progress: 40/4182 | EM: 45.00% | F1: 53.54%


Evaluating:   1%|          | 50/4182 [01:35<2:08:48,  1.87s/it]

Progress: 50/4182 | EM: 44.00% | F1: 51.31%


Evaluating:   1%|▏         | 60/4182 [01:59<2:53:11,  2.52s/it]

Progress: 60/4182 | EM: 45.00% | F1: 53.31%


Evaluating:   2%|▏         | 70/4182 [02:10<1:32:54,  1.36s/it]

Progress: 70/4182 | EM: 44.29% | F1: 52.04%


Evaluating:   2%|▏         | 80/4182 [02:32<2:47:54,  2.46s/it]

Progress: 80/4182 | EM: 43.75% | F1: 51.27%


Evaluating:   2%|▏         | 90/4182 [03:03<3:34:38,  3.15s/it]

Progress: 90/4182 | EM: 38.89% | F1: 46.13%


Evaluating:   2%|▏         | 100/4182 [03:34<3:36:43,  3.19s/it]

Progress: 100/4182 | EM: 35.00% | F1: 41.82%


Evaluating:   3%|▎         | 110/4182 [04:06<3:35:24,  3.17s/it]

Progress: 110/4182 | EM: 31.82% | F1: 38.20%


Evaluating:   3%|▎         | 120/4182 [04:38<3:35:00,  3.18s/it]

Progress: 120/4182 | EM: 29.17% | F1: 35.05%


Evaluating:   3%|▎         | 130/4182 [05:10<3:36:03,  3.20s/it]

Progress: 130/4182 | EM: 26.92% | F1: 32.59%


Evaluating:   3%|▎         | 140/4182 [05:42<3:35:51,  3.20s/it]

Progress: 140/4182 | EM: 25.00% | F1: 30.43%


Evaluating:   4%|▎         | 150/4182 [06:14<3:34:13,  3.19s/it]

Progress: 150/4182 | EM: 23.33% | F1: 28.55%


Evaluating:   4%|▍         | 160/4182 [06:46<3:35:18,  3.21s/it]

Progress: 160/4182 | EM: 21.88% | F1: 26.94%


Evaluating:   4%|▍         | 170/4182 [07:11<2:41:31,  2.42s/it]

Progress: 170/4182 | EM: 21.18% | F1: 26.71%


Evaluating:   4%|▍         | 180/4182 [07:35<2:32:11,  2.28s/it]

Progress: 180/4182 | EM: 22.22% | F1: 27.49%


Evaluating:   5%|▍         | 190/4182 [07:46<1:06:01,  1.01it/s]

Progress: 190/4182 | EM: 24.21% | F1: 29.26%


Evaluating:   5%|▍         | 200/4182 [08:00<57:23,  1.16it/s]  

Progress: 200/4182 | EM: 26.00% | F1: 31.04%


Evaluating:   5%|▌         | 210/4182 [08:23<2:48:39,  2.55s/it]

Progress: 210/4182 | EM: 25.71% | F1: 31.18%


Evaluating:   5%|▌         | 220/4182 [08:51<3:05:10,  2.80s/it]

Progress: 220/4182 | EM: 26.82% | F1: 32.18%


Evaluating:   5%|▌         | 230/4182 [09:14<2:49:09,  2.57s/it]

Progress: 230/4182 | EM: 26.96% | F1: 32.27%


Evaluating:   6%|▌         | 240/4182 [09:33<2:09:47,  1.98s/it]

Progress: 240/4182 | EM: 27.50% | F1: 33.03%


Evaluating:   6%|▌         | 250/4182 [09:54<2:03:26,  1.88s/it]

Progress: 250/4182 | EM: 27.20% | F1: 33.11%


Evaluating:   6%|▌         | 260/4182 [10:11<2:00:33,  1.84s/it]

Progress: 260/4182 | EM: 26.92% | F1: 33.01%


Evaluating:   6%|▋         | 270/4182 [10:25<2:01:27,  1.86s/it]

Progress: 270/4182 | EM: 27.41% | F1: 33.31%


Evaluating:   7%|▋         | 280/4182 [10:43<1:52:24,  1.73s/it]

Progress: 280/4182 | EM: 27.86% | F1: 33.60%


Evaluating:   7%|▋         | 290/4182 [10:59<2:26:15,  2.25s/it]

Progress: 290/4182 | EM: 27.24% | F1: 33.47%


Evaluating:   7%|▋         | 300/4182 [11:27<2:52:56,  2.67s/it]

Progress: 300/4182 | EM: 26.33% | F1: 32.51%


Evaluating:   7%|▋         | 310/4182 [11:50<2:42:57,  2.53s/it]

Progress: 310/4182 | EM: 26.13% | F1: 32.33%


Evaluating:   8%|▊         | 320/4182 [12:11<2:24:42,  2.25s/it]

Progress: 320/4182 | EM: 26.88% | F1: 32.89%


Evaluating:   8%|▊         | 330/4182 [12:29<1:59:02,  1.85s/it]

Progress: 330/4182 | EM: 26.97% | F1: 32.96%


Evaluating:   8%|▊         | 340/4182 [12:50<2:23:22,  2.24s/it]

Progress: 340/4182 | EM: 27.35% | F1: 33.16%


Evaluating:   8%|▊         | 350/4182 [13:10<1:37:16,  1.52s/it]

Progress: 350/4182 | EM: 28.86% | F1: 34.64%


Evaluating:   9%|▊         | 360/4182 [13:31<1:52:41,  1.77s/it]

Progress: 360/4182 | EM: 28.89% | F1: 34.68%


Evaluating:   9%|▉         | 370/4182 [13:53<2:37:53,  2.49s/it]

Progress: 370/4182 | EM: 28.65% | F1: 34.31%


Evaluating:   9%|▉         | 380/4182 [14:14<2:34:21,  2.44s/it]

Progress: 380/4182 | EM: 28.16% | F1: 33.93%


Evaluating:   9%|▉         | 390/4182 [14:34<1:58:25,  1.87s/it]

Progress: 390/4182 | EM: 28.21% | F1: 33.89%


Evaluating:  10%|▉         | 400/4182 [15:00<2:49:11,  2.68s/it]

Progress: 400/4182 | EM: 28.75% | F1: 34.34%


Evaluating:  10%|▉         | 410/4182 [15:27<2:52:00,  2.74s/it]

Progress: 410/4182 | EM: 29.27% | F1: 34.86%


Evaluating:  10%|█         | 420/4182 [15:51<2:29:07,  2.38s/it]

Progress: 420/4182 | EM: 28.81% | F1: 34.45%


Evaluating:  10%|█         | 430/4182 [16:17<2:18:57,  2.22s/it]

Progress: 430/4182 | EM: 28.84% | F1: 34.50%


Evaluating:  11%|█         | 440/4182 [16:39<2:05:27,  2.01s/it]

Progress: 440/4182 | EM: 28.86% | F1: 34.64%


Evaluating:  11%|█         | 450/4182 [17:06<2:58:23,  2.87s/it]

Progress: 450/4182 | EM: 29.11% | F1: 34.97%


Evaluating:  11%|█         | 460/4182 [17:24<2:00:12,  1.94s/it]

Progress: 460/4182 | EM: 29.35% | F1: 35.44%


Evaluating:  11%|█         | 470/4182 [17:33<1:48:36,  1.76s/it]

Progress: 470/4182 | EM: 30.43% | F1: 36.41%


Evaluating:  11%|█▏        | 480/4182 [17:54<1:43:35,  1.68s/it]

Progress: 480/4182 | EM: 31.04% | F1: 36.94%


Evaluating:  12%|█▏        | 490/4182 [18:07<1:29:22,  1.45s/it]

Progress: 490/4182 | EM: 31.84% | F1: 37.74%


Evaluating:  12%|█▏        | 500/4182 [18:31<2:43:45,  2.67s/it]

Progress: 500/4182 | EM: 31.40% | F1: 37.40%


Evaluating:  12%|█▏        | 510/4182 [18:55<2:32:17,  2.49s/it]

Progress: 510/4182 | EM: 31.96% | F1: 37.89%


Evaluating:  12%|█▏        | 520/4182 [19:15<1:23:25,  1.37s/it]

Progress: 520/4182 | EM: 31.73% | F1: 37.57%


Evaluating:  13%|█▎        | 530/4182 [19:40<2:39:27,  2.62s/it]

Progress: 530/4182 | EM: 31.89% | F1: 37.63%


Evaluating:  13%|█▎        | 540/4182 [20:03<2:29:24,  2.46s/it]

Progress: 540/4182 | EM: 31.85% | F1: 37.59%


Evaluating:  13%|█▎        | 550/4182 [20:28<2:26:55,  2.43s/it]

Progress: 550/4182 | EM: 31.64% | F1: 37.49%


Evaluating:  13%|█▎        | 560/4182 [20:54<2:33:06,  2.54s/it]

Progress: 560/4182 | EM: 31.43% | F1: 37.32%


Evaluating:  14%|█▎        | 570/4182 [21:19<2:30:59,  2.51s/it]

Progress: 570/4182 | EM: 31.40% | F1: 37.20%


Evaluating:  14%|█▍        | 580/4182 [21:47<2:59:30,  2.99s/it]

Progress: 580/4182 | EM: 31.21% | F1: 36.92%


Evaluating:  14%|█▍        | 590/4182 [22:18<3:00:59,  3.02s/it]

Progress: 590/4182 | EM: 30.85% | F1: 36.51%


Evaluating:  14%|█▍        | 600/4182 [22:48<3:00:23,  3.02s/it]

Progress: 600/4182 | EM: 30.50% | F1: 36.08%


Evaluating:  15%|█▍        | 610/4182 [23:18<2:59:10,  3.01s/it]

Progress: 610/4182 | EM: 30.16% | F1: 35.69%


Evaluating:  15%|█▍        | 620/4182 [23:46<2:47:13,  2.82s/it]

Progress: 620/4182 | EM: 29.84% | F1: 35.36%


Evaluating:  15%|█▌        | 630/4182 [24:15<2:53:39,  2.93s/it]

Progress: 630/4182 | EM: 30.00% | F1: 35.59%


Evaluating:  15%|█▌        | 640/4182 [24:39<2:34:33,  2.62s/it]

Progress: 640/4182 | EM: 29.53% | F1: 35.14%


Evaluating:  16%|█▌        | 650/4182 [25:07<2:47:44,  2.85s/it]

Progress: 650/4182 | EM: 29.38% | F1: 34.93%


Evaluating:  16%|█▌        | 660/4182 [25:35<2:40:53,  2.74s/it]

Progress: 660/4182 | EM: 29.39% | F1: 34.99%


Evaluating:  16%|█▌        | 670/4182 [26:00<2:35:10,  2.65s/it]

Progress: 670/4182 | EM: 29.55% | F1: 35.32%


Evaluating:  16%|█▋        | 680/4182 [26:19<1:01:10,  1.05s/it]

Progress: 680/4182 | EM: 29.85% | F1: 35.58%


Evaluating:  16%|█▋        | 690/4182 [26:38<1:55:00,  1.98s/it]

Progress: 690/4182 | EM: 29.86% | F1: 35.60%


Evaluating:  17%|█▋        | 700/4182 [27:03<2:32:13,  2.62s/it]

Progress: 700/4182 | EM: 29.71% | F1: 35.45%


Evaluating:  17%|█▋        | 710/4182 [27:23<1:37:25,  1.68s/it]

Progress: 710/4182 | EM: 29.58% | F1: 35.44%


Evaluating:  17%|█▋        | 720/4182 [27:46<2:15:12,  2.34s/it]

Progress: 720/4182 | EM: 29.44% | F1: 35.41%


Evaluating:  17%|█▋        | 730/4182 [28:04<1:44:54,  1.82s/it]

Progress: 730/4182 | EM: 29.45% | F1: 35.42%


Evaluating:  18%|█▊        | 740/4182 [28:33<2:49:31,  2.96s/it]

Progress: 740/4182 | EM: 29.32% | F1: 35.30%


Evaluating:  18%|█▊        | 750/4182 [29:05<3:02:23,  3.19s/it]

Progress: 750/4182 | EM: 28.93% | F1: 34.87%


Evaluating:  18%|█▊        | 760/4182 [29:36<3:01:40,  3.19s/it]

Progress: 760/4182 | EM: 28.55% | F1: 34.47%


Evaluating:  18%|█▊        | 770/4182 [30:08<3:01:46,  3.20s/it]

Progress: 770/4182 | EM: 28.18% | F1: 34.12%


Evaluating:  19%|█▊        | 780/4182 [30:39<2:33:36,  2.71s/it]

Progress: 780/4182 | EM: 27.82% | F1: 33.76%


Evaluating:  19%|█▉        | 790/4182 [31:00<1:48:43,  1.92s/it]

Progress: 790/4182 | EM: 28.10% | F1: 34.13%


Evaluating:  19%|█▉        | 800/4182 [31:16<2:00:10,  2.13s/it]

Progress: 800/4182 | EM: 28.62% | F1: 34.68%


Evaluating:  19%|█▉        | 810/4182 [31:36<2:10:07,  2.32s/it]

Progress: 810/4182 | EM: 29.38% | F1: 35.36%


Evaluating:  20%|█▉        | 820/4182 [31:53<1:15:30,  1.35s/it]

Progress: 820/4182 | EM: 30.12% | F1: 36.03%


Evaluating:  20%|█▉        | 830/4182 [32:26<2:57:10,  3.17s/it]

Progress: 830/4182 | EM: 29.76% | F1: 35.62%


Evaluating:  20%|██        | 840/4182 [32:58<3:01:58,  3.27s/it]

Progress: 840/4182 | EM: 29.40% | F1: 35.24%


Evaluating:  20%|██        | 850/4182 [33:31<3:01:45,  3.27s/it]

Progress: 850/4182 | EM: 29.06% | F1: 34.83%


Evaluating:  21%|██        | 860/4182 [34:04<3:00:46,  3.26s/it]

Progress: 860/4182 | EM: 28.72% | F1: 34.43%


Evaluating:  21%|██        | 870/4182 [34:34<2:46:41,  3.02s/it]

Progress: 870/4182 | EM: 28.39% | F1: 34.11%


Evaluating:  21%|██        | 880/4182 [35:03<2:36:46,  2.85s/it]

Progress: 880/4182 | EM: 28.07% | F1: 33.74%


Evaluating:  21%|██▏       | 890/4182 [35:35<2:55:33,  3.20s/it]

Progress: 890/4182 | EM: 27.75% | F1: 33.37%


Evaluating:  22%|██▏       | 900/4182 [36:04<2:28:37,  2.72s/it]

Progress: 900/4182 | EM: 27.44% | F1: 33.01%


Evaluating:  22%|██▏       | 910/4182 [36:26<1:27:57,  1.61s/it]

Progress: 910/4182 | EM: 27.36% | F1: 32.92%


Evaluating:  22%|██▏       | 920/4182 [36:38<32:38,  1.67it/s]

Progress: 920/4182 | EM: 27.83% | F1: 33.33%


Evaluating:  22%|██▏       | 930/4182 [36:51<1:13:14,  1.35s/it]

Progress: 930/4182 | EM: 27.85% | F1: 33.35%


Evaluating:  22%|██▏       | 940/4182 [36:59<1:03:57,  1.18s/it]

Progress: 940/4182 | EM: 28.19% | F1: 33.71%


Evaluating:  23%|██▎       | 950/4182 [37:20<1:33:48,  1.74s/it]

Progress: 950/4182 | EM: 28.21% | F1: 33.78%


Evaluating:  23%|██▎       | 960/4182 [37:44<1:43:54,  1.94s/it]

Progress: 960/4182 | EM: 28.54% | F1: 34.06%


Evaluating:  23%|██▎       | 970/4182 [38:04<1:36:21,  1.80s/it]

Progress: 970/4182 | EM: 28.76% | F1: 34.32%


Evaluating:  23%|██▎       | 980/4182 [38:26<2:03:43,  2.32s/it]

Progress: 980/4182 | EM: 28.98% | F1: 34.63%


Evaluating:  24%|██▎       | 990/4182 [38:56<2:45:21,  3.11s/it]

Progress: 990/4182 | EM: 28.89% | F1: 34.53%


Evaluating:  24%|██▍       | 1000/4182 [39:27<2:48:23,  3.18s/it]

Progress: 1000/4182 | EM: 28.60% | F1: 34.22%


Evaluating:  24%|██▍       | 1010/4182 [39:59<2:47:55,  3.18s/it]

Progress: 1010/4182 | EM: 28.32% | F1: 33.93%


Evaluating:  24%|██▍       | 1020/4182 [40:31<2:47:41,  3.18s/it]

Progress: 1020/4182 | EM: 28.04% | F1: 33.60%


Evaluating:  25%|██▍       | 1030/4182 [40:56<2:19:26,  2.65s/it]

Progress: 1030/4182 | EM: 27.86% | F1: 33.43%


Evaluating:  25%|██▍       | 1040/4182 [41:18<1:49:59,  2.10s/it]

Progress: 1040/4182 | EM: 27.69% | F1: 33.36%


Evaluating:  25%|██▌       | 1050/4182 [41:44<2:11:55,  2.53s/it]

Progress: 1050/4182 | EM: 27.52% | F1: 33.25%


Evaluating:  25%|██▌       | 1060/4182 [42:09<2:12:32,  2.55s/it]

Progress: 1060/4182 | EM: 27.55% | F1: 33.25%


Evaluating:  26%|██▌       | 1070/4182 [42:36<2:29:15,  2.88s/it]

Progress: 1070/4182 | EM: 27.29% | F1: 33.04%


Evaluating:  26%|██▌       | 1080/4182 [43:08<2:40:06,  3.10s/it]

Progress: 1080/4182 | EM: 27.04% | F1: 32.75%


Evaluating:  26%|██▌       | 1090/4182 [43:38<2:38:33,  3.08s/it]

Progress: 1090/4182 | EM: 26.79% | F1: 32.46%


Evaluating:  26%|██▋       | 1100/4182 [44:10<2:39:30,  3.11s/it]

Progress: 1100/4182 | EM: 26.64% | F1: 32.26%


Evaluating:  27%|██▋       | 1110/4182 [44:37<1:53:06,  2.21s/it]

Progress: 1110/4182 | EM: 26.40% | F1: 32.04%


Evaluating:  27%|██▋       | 1120/4182 [44:53<57:17,  1.12s/it]  

Progress: 1120/4182 | EM: 26.34% | F1: 32.00%


Evaluating:  27%|██▋       | 1130/4182 [45:10<1:36:50,  1.90s/it]

Progress: 1130/4182 | EM: 26.19% | F1: 31.92%


Evaluating:  27%|██▋       | 1140/4182 [45:17<1:10:54,  1.40s/it]

Progress: 1140/4182 | EM: 26.32% | F1: 32.04%


Evaluating:  27%|██▋       | 1150/4182 [45:34<1:47:02,  2.12s/it]

Progress: 1150/4182 | EM: 26.43% | F1: 32.13%


Evaluating:  28%|██▊       | 1160/4182 [46:05<2:34:54,  3.08s/it]

Progress: 1160/4182 | EM: 26.21% | F1: 31.88%


Evaluating:  28%|██▊       | 1170/4182 [46:36<2:35:17,  3.09s/it]

Progress: 1170/4182 | EM: 25.98% | F1: 31.60%


Evaluating:  28%|██▊       | 1180/4182 [47:07<2:35:30,  3.11s/it]

Progress: 1180/4182 | EM: 25.76% | F1: 31.34%


Evaluating:  28%|██▊       | 1190/4182 [47:38<2:28:23,  2.98s/it]

Progress: 1190/4182 | EM: 25.55% | F1: 31.14%


Evaluating:  29%|██▊       | 1200/4182 [48:03<2:01:45,  2.45s/it]

Progress: 1200/4182 | EM: 25.42% | F1: 31.11%


Evaluating:  29%|██▉       | 1210/4182 [48:27<2:07:03,  2.56s/it]

Progress: 1210/4182 | EM: 25.45% | F1: 31.24%


Evaluating:  29%|██▉       | 1220/4182 [48:48<1:49:54,  2.23s/it]

Progress: 1220/4182 | EM: 25.74% | F1: 31.54%


Evaluating:  29%|██▉       | 1230/4182 [49:11<1:54:21,  2.32s/it]

Progress: 1230/4182 | EM: 26.02% | F1: 31.87%


Evaluating:  30%|██▉       | 1240/4182 [49:42<2:30:24,  3.07s/it]

Progress: 1240/4182 | EM: 25.81% | F1: 31.66%


Evaluating:  30%|██▉       | 1250/4182 [50:13<2:31:20,  3.10s/it]

Progress: 1250/4182 | EM: 25.60% | F1: 31.41%


Evaluating:  30%|███       | 1260/4182 [50:44<2:30:46,  3.10s/it]

Progress: 1260/4182 | EM: 25.40% | F1: 31.17%


Evaluating:  30%|███       | 1270/4182 [51:14<2:29:49,  3.09s/it]

Progress: 1270/4182 | EM: 25.20% | F1: 30.93%


Evaluating:  31%|███       | 1280/4182 [51:46<2:31:44,  3.14s/it]

Progress: 1280/4182 | EM: 25.00% | F1: 30.69%


Evaluating:  31%|███       | 1290/4182 [52:17<2:31:17,  3.14s/it]

Progress: 1290/4182 | EM: 24.81% | F1: 30.46%


Evaluating:  31%|███       | 1300/4182 [52:48<2:30:30,  3.13s/it]

Progress: 1300/4182 | EM: 24.62% | F1: 30.28%


Evaluating:  31%|███▏      | 1310/4182 [53:20<2:30:27,  3.14s/it]

Progress: 1310/4182 | EM: 24.43% | F1: 30.10%


Evaluating:  32%|███▏      | 1320/4182 [53:45<1:48:33,  2.28s/it]

Progress: 1320/4182 | EM: 24.39% | F1: 30.06%


Evaluating:  32%|███▏      | 1330/4182 [54:10<2:07:47,  2.69s/it]

Progress: 1330/4182 | EM: 24.36% | F1: 30.04%


Evaluating:  32%|███▏      | 1340/4182 [54:30<1:52:54,  2.38s/it]

Progress: 1340/4182 | EM: 24.48% | F1: 30.12%


Evaluating:  32%|███▏      | 1350/4182 [54:57<2:08:38,  2.73s/it]

Progress: 1350/4182 | EM: 24.67% | F1: 30.31%


Evaluating:  33%|███▎      | 1360/4182 [55:18<1:47:53,  2.29s/it]

Progress: 1360/4182 | EM: 24.78% | F1: 30.39%


Evaluating:  33%|███▎      | 1370/4182 [55:34<35:26,  1.32it/s]

Progress: 1370/4182 | EM: 25.11% | F1: 30.71%


Evaluating:  33%|███▎      | 1380/4182 [55:49<49:44,  1.07s/it]

Progress: 1380/4182 | EM: 25.22% | F1: 30.78%


Evaluating:  33%|███▎      | 1390/4182 [56:05<1:35:13,  2.05s/it]

Progress: 1390/4182 | EM: 25.47% | F1: 31.03%


Evaluating:  33%|███▎      | 1400/4182 [56:22<1:20:15,  1.73s/it]

Progress: 1400/4182 | EM: 25.57% | F1: 31.12%


Evaluating:  34%|███▎      | 1410/4182 [56:40<1:40:47,  2.18s/it]

Progress: 1410/4182 | EM: 25.96% | F1: 31.49%


Evaluating:  34%|███▍      | 1420/4182 [56:57<1:48:06,  2.35s/it]

Progress: 1420/4182 | EM: 26.48% | F1: 31.97%


Evaluating:  34%|███▍      | 1430/4182 [57:10<30:01,  1.53it/s]

Progress: 1430/4182 | EM: 26.92% | F1: 32.40%


Evaluating:  34%|███▍      | 1440/4182 [57:30<1:34:34,  2.07s/it]

Progress: 1440/4182 | EM: 27.08% | F1: 32.61%


Evaluating:  35%|███▍      | 1450/4182 [57:54<1:36:25,  2.12s/it]

Progress: 1450/4182 | EM: 27.31% | F1: 32.82%


Evaluating:  35%|███▍      | 1460/4182 [58:10<1:02:18,  1.37s/it]

Progress: 1460/4182 | EM: 27.60% | F1: 33.07%


Evaluating:  35%|███▌      | 1470/4182 [58:24<1:30:24,  2.00s/it]

Progress: 1470/4182 | EM: 27.76% | F1: 33.19%


Evaluating:  35%|███▌      | 1480/4182 [58:39<1:31:27,  2.03s/it]

Progress: 1480/4182 | EM: 27.91% | F1: 33.41%


Evaluating:  36%|███▌      | 1490/4182 [59:01<1:52:23,  2.50s/it]

Progress: 1490/4182 | EM: 27.85% | F1: 33.40%


Evaluating:  36%|███▌      | 1500/4182 [59:24<1:51:44,  2.50s/it]

Progress: 1500/4182 | EM: 27.73% | F1: 33.39%


Evaluating:  36%|███▌      | 1510/4182 [59:46<1:25:42,  1.92s/it]

Progress: 1510/4182 | EM: 28.08% | F1: 33.76%


Evaluating:  36%|███▋      | 1520/4182 [1:00:10<1:44:55,  2.36s/it]

Progress: 1520/4182 | EM: 28.09% | F1: 33.74%


Evaluating:  37%|███▋      | 1530/4182 [1:00:28<1:23:33,  1.89s/it]

Progress: 1530/4182 | EM: 28.04% | F1: 33.79%


Evaluating:  37%|███▋      | 1540/4182 [1:00:52<1:47:17,  2.44s/it]

Progress: 1540/4182 | EM: 28.05% | F1: 33.85%


Evaluating:  37%|███▋      | 1550/4182 [1:01:15<1:43:33,  2.36s/it]

Progress: 1550/4182 | EM: 28.00% | F1: 33.86%


Evaluating:  37%|███▋      | 1560/4182 [1:01:39<1:53:12,  2.59s/it]

Progress: 1560/4182 | EM: 28.08% | F1: 33.92%


Evaluating:  38%|███▊      | 1570/4182 [1:01:59<1:02:54,  1.45s/it]

Progress: 1570/4182 | EM: 28.28% | F1: 34.12%


Evaluating:  38%|███▊      | 1580/4182 [1:02:13<1:13:21,  1.69s/it]

Progress: 1580/4182 | EM: 28.35% | F1: 34.22%


Evaluating:  38%|███▊      | 1590/4182 [1:02:30<1:16:41,  1.78s/it]

Progress: 1590/4182 | EM: 28.55% | F1: 34.39%


Evaluating:  38%|███▊      | 1600/4182 [1:02:45<1:25:20,  1.98s/it]

Progress: 1600/4182 | EM: 28.75% | F1: 34.55%


Evaluating:  38%|███▊      | 1610/4182 [1:03:13<1:58:51,  2.77s/it]

Progress: 1610/4182 | EM: 28.76% | F1: 34.55%


Evaluating:  39%|███▊      | 1620/4182 [1:03:41<1:59:26,  2.80s/it]

Progress: 1620/4182 | EM: 28.83% | F1: 34.62%


Evaluating:  39%|███▉      | 1630/4182 [1:04:01<1:27:14,  2.05s/it]

Progress: 1630/4182 | EM: 28.83% | F1: 34.60%


Evaluating:  39%|███▉      | 1640/4182 [1:04:25<1:42:09,  2.41s/it]

Progress: 1640/4182 | EM: 28.90% | F1: 34.64%


Evaluating:  39%|███▉      | 1650/4182 [1:04:58<2:17:23,  3.26s/it]

Progress: 1650/4182 | EM: 28.73% | F1: 34.43%


Evaluating:  40%|███▉      | 1660/4182 [1:05:30<2:16:55,  3.26s/it]

Progress: 1660/4182 | EM: 28.55% | F1: 34.23%


Evaluating:  40%|███▉      | 1670/4182 [1:06:02<2:11:58,  3.15s/it]

Progress: 1670/4182 | EM: 28.38% | F1: 34.03%


Evaluating:  40%|████      | 1680/4182 [1:06:34<2:12:36,  3.18s/it]

Progress: 1680/4182 | EM: 28.21% | F1: 33.83%


Evaluating:  40%|████      | 1690/4182 [1:07:00<1:24:00,  2.02s/it]

Progress: 1690/4182 | EM: 28.11% | F1: 33.74%


Evaluating:  41%|████      | 1700/4182 [1:07:25<1:46:34,  2.58s/it]

Progress: 1700/4182 | EM: 27.94% | F1: 33.61%


Evaluating:  41%|████      | 1710/4182 [1:07:48<1:49:04,  2.65s/it]

Progress: 1710/4182 | EM: 27.95% | F1: 33.59%


Evaluating:  41%|████      | 1720/4182 [1:08:08<1:41:47,  2.48s/it]

Progress: 1720/4182 | EM: 28.14% | F1: 33.74%


Evaluating:  41%|████▏     | 1730/4182 [1:08:38<2:05:47,  3.08s/it]

Progress: 1730/4182 | EM: 27.98% | F1: 33.55%


Evaluating:  42%|████▏     | 1740/4182 [1:09:10<2:07:43,  3.14s/it]

Progress: 1740/4182 | EM: 27.82% | F1: 33.36%


Evaluating:  42%|████▏     | 1750/4182 [1:09:42<2:09:20,  3.19s/it]

Progress: 1750/4182 | EM: 27.66% | F1: 33.20%


Evaluating:  42%|████▏     | 1760/4182 [1:10:11<2:04:15,  3.08s/it]

Progress: 1760/4182 | EM: 27.50% | F1: 33.02%


Evaluating:  42%|████▏     | 1770/4182 [1:10:42<2:03:52,  3.08s/it]

Progress: 1770/4182 | EM: 27.34% | F1: 32.87%


Evaluating:  43%|████▎     | 1780/4182 [1:11:13<2:03:42,  3.09s/it]

Progress: 1780/4182 | EM: 27.19% | F1: 32.69%


Evaluating:  43%|████▎     | 1790/4182 [1:11:44<2:03:29,  3.10s/it]

Progress: 1790/4182 | EM: 27.09% | F1: 32.57%


Evaluating:  43%|████▎     | 1800/4182 [1:12:15<2:02:34,  3.09s/it]

Progress: 1800/4182 | EM: 26.94% | F1: 32.39%


Evaluating:  43%|████▎     | 1810/4182 [1:12:41<1:39:57,  2.53s/it]

Progress: 1810/4182 | EM: 26.85% | F1: 32.34%


Evaluating:  44%|████▎     | 1820/4182 [1:13:01<1:26:59,  2.21s/it]

Progress: 1820/4182 | EM: 26.87% | F1: 32.37%


Evaluating:  44%|████▍     | 1830/4182 [1:13:16<35:02,  1.12it/s]

Progress: 1830/4182 | EM: 26.83% | F1: 32.50%


Evaluating:  44%|████▍     | 1840/4182 [1:13:33<1:18:16,  2.01s/it]

Progress: 1840/4182 | EM: 26.90% | F1: 32.56%


Evaluating:  44%|████▍     | 1850/4182 [1:13:58<1:53:41,  2.93s/it]

Progress: 1850/4182 | EM: 26.76% | F1: 32.40%


Evaluating:  44%|████▍     | 1860/4182 [1:14:29<2:01:58,  3.15s/it]

Progress: 1860/4182 | EM: 26.61% | F1: 32.23%


Evaluating:  45%|████▍     | 1870/4182 [1:15:01<2:00:52,  3.14s/it]

Progress: 1870/4182 | EM: 26.47% | F1: 32.06%


Evaluating:  45%|████▍     | 1880/4182 [1:15:32<2:00:32,  3.14s/it]

Progress: 1880/4182 | EM: 26.33% | F1: 31.91%


Evaluating:  45%|████▌     | 1890/4182 [1:16:04<2:01:06,  3.17s/it]

Progress: 1890/4182 | EM: 26.19% | F1: 31.78%


Evaluating:  45%|████▌     | 1900/4182 [1:16:36<2:00:31,  3.17s/it]

Progress: 1900/4182 | EM: 26.05% | F1: 31.63%


Evaluating:  46%|████▌     | 1910/4182 [1:17:07<1:59:49,  3.16s/it]

Progress: 1910/4182 | EM: 25.92% | F1: 31.47%


Evaluating:  46%|████▌     | 1920/4182 [1:17:39<1:59:12,  3.16s/it]

Progress: 1920/4182 | EM: 25.78% | F1: 31.34%


Evaluating:  46%|████▌     | 1930/4182 [1:18:07<1:23:09,  2.22s/it]

Progress: 1930/4182 | EM: 25.70% | F1: 31.26%


Evaluating:  46%|████▋     | 1940/4182 [1:18:25<1:23:42,  2.24s/it]

Progress: 1940/4182 | EM: 25.77% | F1: 31.33%


Evaluating:  47%|████▋     | 1950/4182 [1:18:45<56:12,  1.51s/it]  

Progress: 1950/4182 | EM: 25.90% | F1: 31.43%


Evaluating:  47%|████▋     | 1960/4182 [1:19:02<57:42,  1.56s/it]

Progress: 1960/4182 | EM: 26.07% | F1: 31.58%


Evaluating:  47%|████▋     | 1970/4182 [1:19:15<55:24,  1.50s/it]

Progress: 1970/4182 | EM: 26.04% | F1: 31.52%


Evaluating:  47%|████▋     | 1980/4182 [1:19:37<1:13:18,  2.00s/it]

Progress: 1980/4182 | EM: 26.26% | F1: 31.76%


Evaluating:  48%|████▊     | 1990/4182 [1:19:52<38:52,  1.06s/it]

Progress: 1990/4182 | EM: 26.43% | F1: 31.93%


Evaluating:  48%|████▊     | 2000/4182 [1:20:06<51:11,  1.41s/it]  

Progress: 2000/4182 | EM: 26.55% | F1: 32.05%


Evaluating:  48%|████▊     | 2010/4182 [1:20:27<1:16:40,  2.12s/it]

Progress: 2010/4182 | EM: 26.77% | F1: 32.24%


Evaluating:  48%|████▊     | 2020/4182 [1:20:50<1:19:49,  2.22s/it]

Progress: 2020/4182 | EM: 26.73% | F1: 32.19%


Evaluating:  49%|████▊     | 2030/4182 [1:21:10<1:13:46,  2.06s/it]

Progress: 2030/4182 | EM: 26.75% | F1: 32.19%


Evaluating:  49%|████▉     | 2040/4182 [1:21:29<37:42,  1.06s/it]

Progress: 2040/4182 | EM: 26.81% | F1: 32.25%


Evaluating:  49%|████▉     | 2050/4182 [1:21:53<1:30:43,  2.55s/it]

Progress: 2050/4182 | EM: 26.83% | F1: 32.25%


Evaluating:  49%|████▉     | 2060/4182 [1:22:21<1:38:45,  2.79s/it]

Progress: 2060/4182 | EM: 26.80% | F1: 32.24%


Evaluating:  49%|████▉     | 2070/4182 [1:22:46<1:31:09,  2.59s/it]

Progress: 2070/4182 | EM: 26.71% | F1: 32.17%


Evaluating:  50%|████▉     | 2080/4182 [1:23:11<1:09:13,  1.98s/it]

Progress: 2080/4182 | EM: 26.68% | F1: 32.13%


Evaluating:  50%|████▉     | 2090/4182 [1:23:34<1:22:33,  2.37s/it]

Progress: 2090/4182 | EM: 26.70% | F1: 32.14%


Evaluating:  50%|█████     | 2100/4182 [1:23:56<1:01:30,  1.77s/it]

Progress: 2100/4182 | EM: 26.62% | F1: 32.07%


Evaluating:  50%|█████     | 2110/4182 [1:24:21<1:27:18,  2.53s/it]

Progress: 2110/4182 | EM: 26.64% | F1: 32.09%


Evaluating:  51%|█████     | 2120/4182 [1:24:44<1:04:27,  1.88s/it]

Progress: 2120/4182 | EM: 26.70% | F1: 32.13%


Evaluating:  51%|█████     | 2130/4182 [1:24:59<1:11:33,  2.09s/it]

Progress: 2130/4182 | EM: 26.76% | F1: 32.17%


Evaluating:  51%|█████     | 2140/4182 [1:25:24<1:26:44,  2.55s/it]

Progress: 2140/4182 | EM: 26.78% | F1: 32.18%


Evaluating:  51%|█████▏    | 2150/4182 [1:25:46<1:13:10,  2.16s/it]

Progress: 2150/4182 | EM: 26.79% | F1: 32.20%


Evaluating:  52%|█████▏    | 2160/4182 [1:26:03<59:23,  1.76s/it]

Progress: 2160/4182 | EM: 26.94% | F1: 32.36%


Evaluating:  52%|█████▏    | 2170/4182 [1:26:18<50:26,  1.50s/it]

Progress: 2170/4182 | EM: 27.00% | F1: 32.43%


Evaluating:  52%|█████▏    | 2180/4182 [1:26:37<1:06:16,  1.99s/it]

Progress: 2180/4182 | EM: 27.06% | F1: 32.48%


Evaluating:  52%|█████▏    | 2190/4182 [1:26:57<46:04,  1.39s/it]

Progress: 2190/4182 | EM: 27.26% | F1: 32.65%


Evaluating:  53%|█████▎    | 2200/4182 [1:27:06<40:51,  1.24s/it]

Progress: 2200/4182 | EM: 27.45% | F1: 32.87%


Evaluating:  53%|█████▎    | 2210/4182 [1:27:18<31:56,  1.03it/s]

Progress: 2210/4182 | EM: 27.69% | F1: 33.09%


Evaluating:  53%|█████▎    | 2220/4182 [1:27:42<1:21:41,  2.50s/it]

Progress: 2220/4182 | EM: 27.75% | F1: 33.16%


Evaluating:  53%|█████▎    | 2230/4182 [1:28:07<1:23:28,  2.57s/it]

Progress: 2230/4182 | EM: 27.85% | F1: 33.27%


Evaluating:  54%|█████▎    | 2240/4182 [1:28:33<1:23:12,  2.57s/it]

Progress: 2240/4182 | EM: 27.86% | F1: 33.26%


Evaluating:  54%|█████▍    | 2250/4182 [1:28:57<1:22:20,  2.56s/it]

Progress: 2250/4182 | EM: 28.04% | F1: 33.42%


Evaluating:  54%|█████▍    | 2261/4182 [1:29:24<1:02:14,  1.94s/it]

Progress: 2260/4182 | EM: 28.05% | F1: 33.44%


Evaluating:  54%|█████▍    | 2270/4182 [1:29:40<53:46,  1.69s/it]

Progress: 2270/4182 | EM: 28.19% | F1: 33.57%


Evaluating:  55%|█████▍    | 2281/4182 [1:29:52<26:37,  1.19it/s]

Progress: 2280/4182 | EM: 28.51% | F1: 33.86%


Evaluating:  55%|█████▍    | 2290/4182 [1:30:05<34:56,  1.11s/it]

Progress: 2290/4182 | EM: 28.73% | F1: 34.08%


Evaluating:  55%|█████▍    | 2300/4182 [1:30:29<1:26:50,  2.77s/it]

Progress: 2300/4182 | EM: 28.78% | F1: 34.11%


Evaluating:  55%|█████▌    | 2310/4182 [1:31:00<1:36:07,  3.08s/it]

Progress: 2310/4182 | EM: 28.66% | F1: 33.96%


Evaluating:  55%|█████▌    | 2320/4182 [1:31:31<1:36:07,  3.10s/it]

Progress: 2320/4182 | EM: 28.53% | F1: 33.82%


Evaluating:  56%|█████▌    | 2330/4182 [1:32:02<1:35:28,  3.09s/it]

Progress: 2330/4182 | EM: 28.41% | F1: 33.67%


Evaluating:  56%|█████▌    | 2340/4182 [1:32:33<1:38:40,  3.21s/it]

Progress: 2340/4182 | EM: 28.29% | F1: 33.53%


Evaluating:  56%|█████▌    | 2350/4182 [1:33:06<1:39:14,  3.25s/it]

Progress: 2350/4182 | EM: 28.17% | F1: 33.40%


Evaluating:  56%|█████▋    | 2360/4182 [1:33:36<1:33:11,  3.07s/it]

Progress: 2360/4182 | EM: 28.05% | F1: 33.26%


Evaluating:  57%|█████▋    | 2370/4182 [1:34:07<1:33:16,  3.09s/it]

Progress: 2370/4182 | EM: 27.93% | F1: 33.12%


Evaluating:  57%|█████▋    | 2380/4182 [1:34:38<1:29:38,  2.98s/it]

Progress: 2380/4182 | EM: 27.82% | F1: 32.99%


Evaluating:  57%|█████▋    | 2390/4182 [1:35:04<1:23:55,  2.81s/it]

Progress: 2390/4182 | EM: 27.87% | F1: 33.02%


Evaluating:  57%|█████▋    | 2400/4182 [1:35:30<1:18:38,  2.65s/it]

Progress: 2400/4182 | EM: 27.83% | F1: 33.02%


Evaluating:  58%|█████▊    | 2410/4182 [1:35:53<45:55,  1.56s/it]  

Progress: 2410/4182 | EM: 27.88% | F1: 33.08%


Evaluating:  58%|█████▊    | 2420/4182 [1:36:21<1:21:34,  2.78s/it]

Progress: 2420/4182 | EM: 27.77% | F1: 33.00%


Evaluating:  58%|█████▊    | 2430/4182 [1:36:42<56:30,  1.94s/it]

Progress: 2430/4182 | EM: 27.78% | F1: 33.04%


Evaluating:  58%|█████▊    | 2440/4182 [1:37:04<51:36,  1.78s/it]

Progress: 2440/4182 | EM: 27.83% | F1: 33.09%


Evaluating:  59%|█████▊    | 2450/4182 [1:37:28<1:10:13,  2.43s/it]

Progress: 2450/4182 | EM: 27.92% | F1: 33.19%


Evaluating:  59%|█████▉    | 2460/4182 [1:37:47<52:00,  1.81s/it]  

Progress: 2460/4182 | EM: 27.97% | F1: 33.23%


Evaluating:  59%|█████▉    | 2470/4182 [1:38:05<51:09,  1.79s/it]

Progress: 2470/4182 | EM: 28.06% | F1: 33.32%


Evaluating:  59%|█████▉    | 2480/4182 [1:38:14<14:55,  1.90it/s]

Progress: 2480/4182 | EM: 28.31% | F1: 33.57%


Evaluating:  60%|█████▉    | 2490/4182 [1:38:22<12:39,  2.23it/s]

Progress: 2490/4182 | EM: 28.55% | F1: 33.80%


Evaluating:  60%|█████▉    | 2500/4182 [1:38:27<12:07,  2.31it/s]

Progress: 2500/4182 | EM: 28.80% | F1: 34.04%


Evaluating:  60%|██████    | 2510/4182 [1:38:58<1:27:01,  3.12s/it]

Progress: 2510/4182 | EM: 28.73% | F1: 33.95%


Evaluating:  60%|██████    | 2520/4182 [1:39:30<1:27:23,  3.15s/it]

Progress: 2520/4182 | EM: 28.61% | F1: 33.83%


Evaluating:  60%|██████    | 2530/4182 [1:40:01<1:26:20,  3.14s/it]

Progress: 2530/4182 | EM: 28.50% | F1: 33.73%


Evaluating:  61%|██████    | 2540/4182 [1:40:33<1:25:48,  3.14s/it]

Progress: 2540/4182 | EM: 28.39% | F1: 33.60%


Evaluating:  61%|██████    | 2550/4182 [1:40:57<1:03:14,  2.33s/it]

Progress: 2550/4182 | EM: 28.27% | F1: 33.53%


Evaluating:  61%|██████    | 2560/4182 [1:41:24<1:12:06,  2.67s/it]

Progress: 2560/4182 | EM: 28.32% | F1: 33.57%


Evaluating:  61%|██████▏   | 2570/4182 [1:41:43<54:50,  2.04s/it]

Progress: 2570/4182 | EM: 28.25% | F1: 33.49%


Evaluating:  62%|██████▏   | 2580/4182 [1:42:06<42:00,  1.57s/it]

Progress: 2580/4182 | EM: 28.22% | F1: 33.45%


Evaluating:  62%|██████▏   | 2590/4182 [1:42:30<1:04:26,  2.43s/it]

Progress: 2590/4182 | EM: 28.22% | F1: 33.47%


Evaluating:  62%|██████▏   | 2600/4182 [1:42:47<1:01:49,  2.34s/it]

Progress: 2600/4182 | EM: 28.38% | F1: 33.62%


Evaluating:  62%|██████▏   | 2610/4182 [1:43:04<49:18,  1.88s/it]

Progress: 2610/4182 | EM: 28.58% | F1: 33.80%


Evaluating:  63%|██████▎   | 2620/4182 [1:43:19<57:57,  2.23s/it]

Progress: 2620/4182 | EM: 28.78% | F1: 33.99%


Evaluating:  63%|██████▎   | 2630/4182 [1:43:45<1:09:56,  2.70s/it]

Progress: 2630/4182 | EM: 28.82% | F1: 34.04%


Evaluating:  63%|██████▎   | 2640/4182 [1:44:12<1:10:24,  2.74s/it]

Progress: 2640/4182 | EM: 28.71% | F1: 33.95%


Evaluating:  63%|██████▎   | 2650/4182 [1:44:35<58:34,  2.29s/it]

Progress: 2650/4182 | EM: 28.72% | F1: 33.94%


Evaluating:  64%|██████▎   | 2660/4182 [1:45:00<1:01:15,  2.42s/it]

Progress: 2660/4182 | EM: 28.68% | F1: 33.91%


Evaluating:  64%|██████▍   | 2670/4182 [1:45:28<1:12:49,  2.89s/it]

Progress: 2670/4182 | EM: 28.58% | F1: 33.82%


Evaluating:  64%|██████▍   | 2680/4182 [1:45:52<1:05:26,  2.61s/it]

Progress: 2680/4182 | EM: 28.58% | F1: 33.84%


Evaluating:  64%|██████▍   | 2690/4182 [1:46:13<1:04:34,  2.60s/it]

Progress: 2690/4182 | EM: 28.59% | F1: 33.86%


Evaluating:  65%|██████▍   | 2700/4182 [1:46:39<1:10:15,  2.84s/it]

Progress: 2700/4182 | EM: 28.52% | F1: 33.78%


Evaluating:  65%|██████▍   | 2710/4182 [1:47:06<1:13:39,  3.00s/it]

Progress: 2710/4182 | EM: 28.52% | F1: 33.77%


Evaluating:  65%|██████▌   | 2720/4182 [1:47:38<1:15:57,  3.12s/it]

Progress: 2720/4182 | EM: 28.42% | F1: 33.66%


Evaluating:  65%|██████▌   | 2730/4182 [1:48:09<1:15:23,  3.12s/it]

Progress: 2730/4182 | EM: 28.32% | F1: 33.57%


Evaluating:  66%|██████▌   | 2740/4182 [1:48:40<1:15:52,  3.16s/it]

Progress: 2740/4182 | EM: 28.21% | F1: 33.46%


Evaluating:  66%|██████▌   | 2750/4182 [1:49:09<57:22,  2.40s/it]  

Progress: 2750/4182 | EM: 28.11% | F1: 33.35%


Evaluating:  66%|██████▌   | 2760/4182 [1:49:35<1:03:20,  2.67s/it]

Progress: 2760/4182 | EM: 28.15% | F1: 33.43%


Evaluating:  66%|██████▌   | 2770/4182 [1:49:53<35:42,  1.52s/it]

Progress: 2770/4182 | EM: 28.09% | F1: 33.39%


Evaluating:  66%|██████▋   | 2780/4182 [1:50:16<53:57,  2.31s/it]

Progress: 2780/4182 | EM: 28.17% | F1: 33.46%


Evaluating:  67%|██████▋   | 2790/4182 [1:50:38<42:31,  1.83s/it]

Progress: 2790/4182 | EM: 28.17% | F1: 33.46%


Evaluating:  67%|██████▋   | 2800/4182 [1:51:02<1:00:25,  2.62s/it]

Progress: 2800/4182 | EM: 28.18% | F1: 33.51%


Evaluating:  67%|██████▋   | 2810/4182 [1:51:27<54:39,  2.39s/it]

Progress: 2810/4182 | EM: 28.22% | F1: 33.53%


Evaluating:  67%|██████▋   | 2820/4182 [1:51:50<48:00,  2.11s/it]

Progress: 2820/4182 | EM: 28.23% | F1: 33.54%


Evaluating:  68%|██████▊   | 2830/4182 [1:52:15<56:40,  2.52s/it]

Progress: 2830/4182 | EM: 28.20% | F1: 33.50%


Evaluating:  68%|██████▊   | 2840/4182 [1:52:37<50:49,  2.27s/it]

Progress: 2840/4182 | EM: 28.13% | F1: 33.44%


Evaluating:  68%|██████▊   | 2850/4182 [1:52:55<39:38,  1.79s/it]

Progress: 2850/4182 | EM: 28.11% | F1: 33.40%


Evaluating:  68%|██████▊   | 2860/4182 [1:53:13<44:07,  2.00s/it]

Progress: 2860/4182 | EM: 28.18% | F1: 33.45%


Evaluating:  69%|██████▊   | 2870/4182 [1:53:33<38:35,  1.76s/it]

Progress: 2870/4182 | EM: 28.26% | F1: 33.56%


Evaluating:  69%|██████▉   | 2880/4182 [1:54:03<1:02:49,  2.89s/it]

Progress: 2880/4182 | EM: 28.16% | F1: 33.44%


Evaluating:  69%|██████▉   | 2890/4182 [1:54:34<1:06:15,  3.08s/it]

Progress: 2890/4182 | EM: 28.06% | F1: 33.34%


Evaluating:  69%|██████▉   | 2900/4182 [1:55:05<1:06:09,  3.10s/it]

Progress: 2900/4182 | EM: 27.97% | F1: 33.23%


Evaluating:  70%|██████▉   | 2910/4182 [1:55:35<1:05:14,  3.08s/it]

Progress: 2910/4182 | EM: 27.87% | F1: 33.12%


Evaluating:  70%|██████▉   | 2920/4182 [1:55:58<45:05,  2.14s/it]

Progress: 2920/4182 | EM: 27.81% | F1: 33.06%


Evaluating:  70%|███████   | 2930/4182 [1:56:18<47:13,  2.26s/it]

Progress: 2930/4182 | EM: 27.85% | F1: 33.11%


Evaluating:  70%|███████   | 2940/4182 [1:56:38<29:02,  1.40s/it]

Progress: 2940/4182 | EM: 27.86% | F1: 33.12%


Evaluating:  71%|███████   | 2950/4182 [1:56:54<28:27,  1.39s/it]

Progress: 2950/4182 | EM: 27.97% | F1: 33.25%


Evaluating:  71%|███████   | 2960/4182 [1:57:15<47:19,  2.32s/it]

Progress: 2960/4182 | EM: 27.94% | F1: 33.25%


Evaluating:  71%|███████   | 2970/4182 [1:57:36<43:31,  2.15s/it]

Progress: 2970/4182 | EM: 27.98% | F1: 33.28%


Evaluating:  71%|███████▏  | 2980/4182 [1:57:58<43:10,  2.16s/it]

Progress: 2980/4182 | EM: 27.92% | F1: 33.21%


Evaluating:  71%|███████▏  | 2990/4182 [1:58:21<43:16,  2.18s/it]

Progress: 2990/4182 | EM: 27.96% | F1: 33.25%


Evaluating:  72%|███████▏  | 3000/4182 [1:58:51<1:00:42,  3.08s/it]

Progress: 3000/4182 | EM: 27.93% | F1: 33.21%


Evaluating:  72%|███████▏  | 3010/4182 [1:59:22<1:01:16,  3.14s/it]

Progress: 3010/4182 | EM: 27.84% | F1: 33.11%


Evaluating:  72%|███████▏  | 3020/4182 [1:59:54<1:00:44,  3.14s/it]

Progress: 3020/4182 | EM: 27.75% | F1: 33.01%


Evaluating:  72%|███████▏  | 3030/4182 [2:00:25<1:00:03,  3.13s/it]

Progress: 3030/4182 | EM: 27.66% | F1: 32.91%


Evaluating:  73%|███████▎  | 3040/4182 [2:00:51<47:16,  2.48s/it]

Progress: 3040/4182 | EM: 27.60% | F1: 32.84%


Evaluating:  73%|███████▎  | 3050/4182 [2:01:05<19:27,  1.03s/it]

Progress: 3050/4182 | EM: 27.70% | F1: 32.94%


Evaluating:  73%|███████▎  | 3060/4182 [2:01:19<28:59,  1.55s/it]

Progress: 3060/4182 | EM: 27.81% | F1: 33.05%


Evaluating:  73%|███████▎  | 3070/4182 [2:01:39<39:34,  2.14s/it]

Progress: 3070/4182 | EM: 27.95% | F1: 33.20%


Evaluating:  74%|███████▎  | 3080/4182 [2:02:02<52:48,  2.88s/it]

Progress: 3080/4182 | EM: 27.92% | F1: 33.17%


Evaluating:  74%|███████▍  | 3090/4182 [2:02:33<55:46,  3.06s/it]

Progress: 3090/4182 | EM: 27.86% | F1: 33.10%


Evaluating:  74%|███████▍  | 3100/4182 [2:03:04<55:02,  3.05s/it]

Progress: 3100/4182 | EM: 27.77% | F1: 33.01%


Evaluating:  74%|███████▍  | 3110/4182 [2:03:34<55:07,  3.09s/it]

Progress: 3110/4182 | EM: 27.68% | F1: 32.91%


Evaluating:  75%|███████▍  | 3120/4182 [2:04:05<54:47,  3.10s/it]

Progress: 3120/4182 | EM: 27.60% | F1: 32.82%


Evaluating:  75%|███████▍  | 3130/4182 [2:04:36<54:15,  3.09s/it]

Progress: 3130/4182 | EM: 27.51% | F1: 32.72%


Evaluating:  75%|███████▌  | 3140/4182 [2:05:08<54:44,  3.15s/it]

Progress: 3140/4182 | EM: 27.42% | F1: 32.63%


Evaluating:  75%|███████▌  | 3150/4182 [2:05:39<53:15,  3.10s/it]

Progress: 3150/4182 | EM: 27.33% | F1: 32.53%


Evaluating:  76%|███████▌  | 3160/4182 [2:06:10<53:31,  3.14s/it]

Progress: 3160/4182 | EM: 27.25% | F1: 32.43%


Evaluating:  76%|███████▌  | 3170/4182 [2:06:41<53:29,  3.17s/it]

Progress: 3170/4182 | EM: 27.16% | F1: 32.33%


Evaluating:  76%|███████▌  | 3180/4182 [2:07:13<52:54,  3.17s/it]

Progress: 3180/4182 | EM: 27.11% | F1: 32.27%


Evaluating:  76%|███████▋  | 3190/4182 [2:07:44<52:40,  3.19s/it]

Progress: 3190/4182 | EM: 27.02% | F1: 32.17%


Evaluating:  77%|███████▋  | 3200/4182 [2:08:16<50:40,  3.10s/it]

Progress: 3200/4182 | EM: 26.94% | F1: 32.08%


Evaluating:  77%|███████▋  | 3210/4182 [2:08:46<50:06,  3.09s/it]

Progress: 3210/4182 | EM: 26.85% | F1: 31.98%


Evaluating:  77%|███████▋  | 3220/4182 [2:09:17<49:34,  3.09s/it]

Progress: 3220/4182 | EM: 26.77% | F1: 31.88%


Evaluating:  77%|███████▋  | 3230/4182 [2:09:48<49:02,  3.09s/it]

Progress: 3230/4182 | EM: 26.69% | F1: 31.79%


Evaluating:  77%|███████▋  | 3240/4182 [2:10:20<49:33,  3.16s/it]

Progress: 3240/4182 | EM: 26.60% | F1: 31.70%


Evaluating:  78%|███████▊  | 3250/4182 [2:10:52<50:34,  3.26s/it]

Progress: 3250/4182 | EM: 26.52% | F1: 31.62%


Evaluating:  78%|███████▊  | 3260/4182 [2:11:25<50:30,  3.29s/it]

Progress: 3260/4182 | EM: 26.44% | F1: 31.54%


Evaluating:  78%|███████▊  | 3270/4182 [2:11:57<48:43,  3.21s/it]

Progress: 3270/4182 | EM: 26.36% | F1: 31.45%


Evaluating:  78%|███████▊  | 3280/4182 [2:12:30<49:24,  3.29s/it]

Progress: 3280/4182 | EM: 26.28% | F1: 31.37%


Evaluating:  79%|███████▊  | 3290/4182 [2:12:51<27:43,  1.87s/it]

Progress: 3290/4182 | EM: 26.23% | F1: 31.34%


Evaluating:  79%|███████▉  | 3300/4182 [2:13:11<32:21,  2.20s/it]

Progress: 3300/4182 | EM: 26.27% | F1: 31.37%


Evaluating:  79%|███████▉  | 3310/4182 [2:13:28<23:20,  1.61s/it]

Progress: 3310/4182 | EM: 26.37% | F1: 31.46%


Evaluating:  79%|███████▉  | 3320/4182 [2:13:46<33:37,  2.34s/it]

Progress: 3320/4182 | EM: 26.45% | F1: 31.52%


Evaluating:  80%|███████▉  | 3330/4182 [2:14:14<43:43,  3.08s/it]

Progress: 3330/4182 | EM: 26.40% | F1: 31.46%


Evaluating:  80%|███████▉  | 3340/4182 [2:14:46<44:22,  3.16s/it]

Progress: 3340/4182 | EM: 26.32% | F1: 31.37%


Evaluating:  80%|████████  | 3350/4182 [2:15:17<43:05,  3.11s/it]

Progress: 3350/4182 | EM: 26.24% | F1: 31.29%


Evaluating:  80%|████████  | 3360/4182 [2:15:48<42:32,  3.11s/it]

Progress: 3360/4182 | EM: 26.16% | F1: 31.21%


Evaluating:  81%|████████  | 3370/4182 [2:16:12<29:29,  2.18s/it]

Progress: 3370/4182 | EM: 26.11% | F1: 31.18%


Evaluating:  81%|████████  | 3380/4182 [2:16:39<35:53,  2.68s/it]

Progress: 3380/4182 | EM: 26.12% | F1: 31.21%


Evaluating:  81%|████████  | 3390/4182 [2:17:02<25:56,  1.97s/it]

Progress: 3390/4182 | EM: 26.14% | F1: 31.23%


Evaluating:  81%|████████▏ | 3400/4182 [2:17:27<33:53,  2.60s/it]

Progress: 3400/4182 | EM: 26.15% | F1: 31.23%


Evaluating:  82%|████████▏ | 3410/4182 [2:17:43<27:20,  2.12s/it]

Progress: 3410/4182 | EM: 26.07% | F1: 31.17%


Evaluating:  82%|████████▏ | 3420/4182 [2:18:04<22:39,  1.78s/it]

Progress: 3420/4182 | EM: 26.08% | F1: 31.17%


Evaluating:  82%|████████▏ | 3430/4182 [2:18:29<26:33,  2.12s/it]

Progress: 3430/4182 | EM: 26.06% | F1: 31.15%


Evaluating:  82%|████████▏ | 3440/4182 [2:18:50<23:23,  1.89s/it]

Progress: 3440/4182 | EM: 26.10% | F1: 31.20%


Evaluating:  82%|████████▏ | 3450/4182 [2:19:12<22:56,  1.88s/it]

Progress: 3450/4182 | EM: 26.14% | F1: 31.24%


Evaluating:  83%|████████▎ | 3460/4182 [2:19:39<32:32,  2.70s/it]

Progress: 3460/4182 | EM: 26.16% | F1: 31.24%


Evaluating:  83%|████████▎ | 3470/4182 [2:20:02<26:07,  2.20s/it]

Progress: 3470/4182 | EM: 26.14% | F1: 31.21%


Evaluating:  83%|████████▎ | 3480/4182 [2:20:22<21:03,  1.80s/it]

Progress: 3480/4182 | EM: 26.18% | F1: 31.27%


Evaluating:  83%|████████▎ | 3490/4182 [2:20:45<24:41,  2.14s/it]

Progress: 3490/4182 | EM: 26.13% | F1: 31.21%


Evaluating:  84%|████████▎ | 3500/4182 [2:21:06<25:06,  2.21s/it]

Progress: 3500/4182 | EM: 26.20% | F1: 31.28%


Evaluating:  84%|████████▍ | 3511/4182 [2:21:23<11:32,  1.03s/it]

Progress: 3510/4182 | EM: 26.30% | F1: 31.38%


Evaluating:  84%|████████▍ | 3520/4182 [2:21:37<22:21,  2.03s/it]

Progress: 3520/4182 | EM: 26.39% | F1: 31.46%


Evaluating:  84%|████████▍ | 3530/4182 [2:22:00<26:03,  2.40s/it]

Progress: 3530/4182 | EM: 26.40% | F1: 31.48%


Evaluating:  85%|████████▍ | 3540/4182 [2:22:17<17:14,  1.61s/it]

Progress: 3540/4182 | EM: 26.38% | F1: 31.48%


Evaluating:  85%|████████▍ | 3550/4182 [2:22:45<28:44,  2.73s/it]

Progress: 3550/4182 | EM: 26.39% | F1: 31.48%


Evaluating:  85%|████████▌ | 3560/4182 [2:23:04<20:45,  2.00s/it]

Progress: 3560/4182 | EM: 26.40% | F1: 31.49%


Evaluating:  85%|████████▌ | 3570/4182 [2:23:31<24:47,  2.43s/it]

Progress: 3570/4182 | EM: 26.44% | F1: 31.53%


Evaluating:  86%|████████▌ | 3580/4182 [2:23:38<03:46,  2.65it/s]

Progress: 3580/4182 | EM: 26.59% | F1: 31.68%


Evaluating:  86%|████████▌ | 3590/4182 [2:23:45<05:20,  1.85it/s]

Progress: 3590/4182 | EM: 26.80% | F1: 31.87%


Evaluating:  86%|████████▌ | 3600/4182 [2:23:48<02:33,  3.79it/s]

Progress: 3600/4182 | EM: 27.00% | F1: 32.06%


Evaluating:  86%|████████▋ | 3610/4182 [2:24:03<18:03,  1.89s/it]

Progress: 3610/4182 | EM: 27.09% | F1: 32.14%


Evaluating:  87%|████████▋ | 3620/4182 [2:24:34<29:05,  3.11s/it]

Progress: 3620/4182 | EM: 27.02% | F1: 32.05%


Evaluating:  87%|████████▋ | 3630/4182 [2:25:06<28:38,  3.11s/it]

Progress: 3630/4182 | EM: 26.94% | F1: 31.97%


Evaluating:  87%|████████▋ | 3640/4182 [2:25:37<28:12,  3.12s/it]

Progress: 3640/4182 | EM: 26.87% | F1: 31.89%


Evaluating:  87%|████████▋ | 3650/4182 [2:26:08<27:14,  3.07s/it]

Progress: 3650/4182 | EM: 26.79% | F1: 31.81%


Evaluating:  88%|████████▊ | 3660/4182 [2:26:37<25:45,  2.96s/it]

Progress: 3660/4182 | EM: 26.72% | F1: 31.75%


Evaluating:  88%|████████▊ | 3670/4182 [2:27:07<25:12,  2.95s/it]

Progress: 3670/4182 | EM: 26.70% | F1: 31.72%


Evaluating:  88%|████████▊ | 3680/4182 [2:27:36<24:39,  2.95s/it]

Progress: 3680/4182 | EM: 26.66% | F1: 31.67%


Evaluating:  88%|████████▊ | 3690/4182 [2:28:06<24:14,  2.96s/it]

Progress: 3690/4182 | EM: 26.61% | F1: 31.63%


Evaluating:  88%|████████▊ | 3700/4182 [2:28:25<17:48,  2.22s/it]

Progress: 3700/4182 | EM: 26.59% | F1: 31.62%


Evaluating:  89%|████████▊ | 3710/4182 [2:28:46<14:07,  1.80s/it]

Progress: 3710/4182 | EM: 26.60% | F1: 31.64%


Evaluating:  89%|████████▉ | 3720/4182 [2:29:05<16:36,  2.16s/it]

Progress: 3720/4182 | EM: 26.72% | F1: 31.75%


Evaluating:  89%|████████▉ | 3730/4182 [2:29:28<17:37,  2.34s/it]

Progress: 3730/4182 | EM: 26.73% | F1: 31.75%


Evaluating:  89%|████████▉ | 3740/4182 [2:29:56<22:37,  3.07s/it]

Progress: 3740/4182 | EM: 26.66% | F1: 31.68%


Evaluating:  90%|████████▉ | 3750/4182 [2:30:28<22:34,  3.14s/it]

Progress: 3750/4182 | EM: 26.59% | F1: 31.61%


Evaluating:  90%|████████▉ | 3760/4182 [2:30:59<21:51,  3.11s/it]

Progress: 3760/4182 | EM: 26.52% | F1: 31.53%


Evaluating:  90%|█████████ | 3770/4182 [2:31:30<21:24,  3.12s/it]

Progress: 3770/4182 | EM: 26.45% | F1: 31.45%


Evaluating:  90%|█████████ | 3780/4182 [2:31:57<16:53,  2.52s/it]

Progress: 3780/4182 | EM: 26.40% | F1: 31.40%


Evaluating:  91%|█████████ | 3790/4182 [2:32:23<17:07,  2.62s/it]

Progress: 3790/4182 | EM: 26.44% | F1: 31.45%


Evaluating:  91%|█████████ | 3800/4182 [2:32:47<15:20,  2.41s/it]

Progress: 3800/4182 | EM: 26.42% | F1: 31.43%


Evaluating:  91%|█████████ | 3810/4182 [2:33:16<17:53,  2.89s/it]

Progress: 3810/4182 | EM: 26.43% | F1: 31.43%


Evaluating:  91%|█████████▏| 3820/4182 [2:33:38<10:58,  1.82s/it]

Progress: 3820/4182 | EM: 26.49% | F1: 31.48%


Evaluating:  92%|█████████▏| 3830/4182 [2:33:51<06:12,  1.06s/it]

Progress: 3830/4182 | EM: 26.61% | F1: 31.58%


Evaluating:  92%|█████████▏| 3840/4182 [2:34:08<07:12,  1.26s/it]

Progress: 3840/4182 | EM: 26.72% | F1: 31.68%


Evaluating:  92%|█████████▏| 3850/4182 [2:34:25<08:37,  1.56s/it]

Progress: 3850/4182 | EM: 26.88% | F1: 31.84%


Evaluating:  92%|█████████▏| 3860/4182 [2:34:44<11:20,  2.11s/it]

Progress: 3860/4182 | EM: 26.94% | F1: 31.90%


Evaluating:  93%|█████████▎| 3870/4182 [2:35:11<14:14,  2.74s/it]

Progress: 3870/4182 | EM: 26.95% | F1: 31.90%


Evaluating:  93%|█████████▎| 3880/4182 [2:35:34<12:44,  2.53s/it]

Progress: 3880/4182 | EM: 27.01% | F1: 31.95%


Evaluating:  93%|█████████▎| 3890/4182 [2:35:59<12:37,  2.59s/it]

Progress: 3890/4182 | EM: 27.02% | F1: 31.95%


Evaluating:  93%|█████████▎| 3900/4182 [2:36:23<09:52,  2.10s/it]

Progress: 3900/4182 | EM: 27.05% | F1: 32.01%


Evaluating:  93%|█████████▎| 3910/4182 [2:36:41<10:09,  2.24s/it]

Progress: 3910/4182 | EM: 27.11% | F1: 32.07%


Evaluating:  94%|█████████▎| 3920/4182 [2:36:58<08:04,  1.85s/it]

Progress: 3920/4182 | EM: 27.12% | F1: 32.08%


Evaluating:  94%|█████████▍| 3930/4182 [2:37:19<08:03,  1.92s/it]

Progress: 3930/4182 | EM: 27.12% | F1: 32.08%


Evaluating:  94%|█████████▍| 3940/4182 [2:37:39<09:08,  2.27s/it]

Progress: 3940/4182 | EM: 27.16% | F1: 32.12%


Evaluating:  94%|█████████▍| 3950/4182 [2:38:05<10:20,  2.67s/it]

Progress: 3950/4182 | EM: 27.19% | F1: 32.16%


Evaluating:  95%|█████████▍| 3960/4182 [2:38:31<09:49,  2.66s/it]

Progress: 3960/4182 | EM: 27.27% | F1: 32.25%


Evaluating:  95%|█████████▍| 3970/4182 [2:38:50<05:10,  1.46s/it]

Progress: 3970/4182 | EM: 27.25% | F1: 32.24%


Evaluating:  95%|█████████▌| 3980/4182 [2:39:17<08:49,  2.62s/it]

Progress: 3980/4182 | EM: 27.26% | F1: 32.24%


Evaluating:  95%|█████████▌| 3990/4182 [2:39:37<07:03,  2.21s/it]

Progress: 3990/4182 | EM: 27.32% | F1: 32.29%


Evaluating:  96%|█████████▌| 4000/4182 [2:39:59<07:35,  2.50s/it]

Progress: 4000/4182 | EM: 27.38% | F1: 32.36%


Evaluating:  96%|█████████▌| 4010/4182 [2:40:25<07:30,  2.62s/it]

Progress: 4010/4182 | EM: 27.46% | F1: 32.43%


Evaluating:  96%|█████████▌| 4020/4182 [2:40:48<07:26,  2.76s/it]

Progress: 4020/4182 | EM: 27.51% | F1: 32.48%


Evaluating:  96%|█████████▋| 4030/4182 [2:41:19<07:46,  3.07s/it]

Progress: 4030/4182 | EM: 27.44% | F1: 32.41%


Evaluating:  97%|█████████▋| 4040/4182 [2:41:50<07:20,  3.10s/it]

Progress: 4040/4182 | EM: 27.38% | F1: 32.33%


Evaluating:  97%|█████████▋| 4050/4182 [2:42:21<06:49,  3.10s/it]

Progress: 4050/4182 | EM: 27.31% | F1: 32.27%


Evaluating:  97%|█████████▋| 4060/4182 [2:42:50<05:26,  2.67s/it]

Progress: 4060/4182 | EM: 27.24% | F1: 32.21%


Evaluating:  97%|█████████▋| 4070/4182 [2:43:15<04:09,  2.23s/it]

Progress: 4070/4182 | EM: 27.27% | F1: 32.26%


Evaluating:  98%|█████████▊| 4080/4182 [2:43:31<03:10,  1.87s/it]

Progress: 4080/4182 | EM: 27.25% | F1: 32.27%


Evaluating:  98%|█████████▊| 4090/4182 [2:43:49<02:11,  1.42s/it]

Progress: 4090/4182 | EM: 27.21% | F1: 32.24%


Evaluating:  98%|█████████▊| 4100/4182 [2:44:01<01:08,  1.20it/s]

Progress: 4100/4182 | EM: 27.22% | F1: 32.27%


Evaluating:  98%|█████████▊| 4110/4182 [2:44:16<02:15,  1.88s/it]

Progress: 4110/4182 | EM: 27.18% | F1: 32.29%


Evaluating:  99%|█████████▊| 4120/4182 [2:44:37<01:47,  1.73s/it]

Progress: 4120/4182 | EM: 27.18% | F1: 32.30%


Evaluating:  99%|█████████▉| 4130/4182 [2:45:01<02:16,  2.63s/it]

Progress: 4130/4182 | EM: 27.17% | F1: 32.27%


Evaluating:  99%|█████████▉| 4140/4182 [2:45:16<01:16,  1.82s/it]

Progress: 4140/4182 | EM: 27.13% | F1: 32.23%


Evaluating:  99%|█████████▉| 4150/4182 [2:45:43<01:25,  2.66s/it]

Progress: 4150/4182 | EM: 27.08% | F1: 32.20%


Evaluating:  99%|█████████▉| 4160/4182 [2:46:07<00:51,  2.33s/it]

Progress: 4160/4182 | EM: 27.07% | F1: 32.17%


Evaluating: 100%|█████████▉| 4170/4182 [2:46:33<00:30,  2.51s/it]

Progress: 4170/4182 | EM: 27.05% | F1: 32.15%


Evaluating: 100%|█████████▉| 4180/4182 [2:46:57<00:05,  2.62s/it]

Progress: 4180/4182 | EM: 27.11% | F1: 32.21%


Evaluating: 100%|██████████| 4182/4182 [2:47:00<00:00,  2.40s/it]



EVALUATION RESULTS
Model: unsloth/Llama-3.2-3B-Instruct-bnb-4bit
Samples Evaluated: 4182
Exact Match: 27.14%
F1 Score: 32.24%
Evaluation Time: 167.00 minutes
Speed: 0.42 samples/second
📁 Predictions saved to: ./evaluation_results/predictions_Llama-3.2-3B-Instruct-bnb-4bit_20250608_043934.json
📁 Detailed results saved to: ./evaluation_results/results_Llama-3.2-3B-Instruct-bnb-4bit_20250608_043934.json

🔄 EVALUATING FINE-TUNED MODEL
Loading model from: ./cuad_finetuned_llama3_2_3b
Max sequence length: 65536
==((====))==  Unsloth 2025.6.1: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
unsloth/llama-3.2-3b

Evaluating:   0%|          | 10/4182 [00:08<48:15,  1.44it/s] 

Progress: 10/4182 | EM: 70.00% | F1: 79.41%


Evaluating:   0%|          | 20/4182 [00:10<18:31,  3.74it/s]

Progress: 20/4182 | EM: 85.00% | F1: 89.71%


Evaluating:   1%|          | 30/4182 [00:13<17:24,  3.97it/s]

Progress: 30/4182 | EM: 90.00% | F1: 93.14%


Evaluating:   1%|          | 40/4182 [00:15<17:37,  3.92it/s]

Progress: 40/4182 | EM: 87.50% | F1: 89.96%


Evaluating:   1%|          | 50/4182 [00:27<1:39:03,  1.44s/it]

Progress: 50/4182 | EM: 80.00% | F1: 83.64%


Evaluating:   1%|▏         | 60/4182 [00:32<53:44,  1.28it/s]

Progress: 60/4182 | EM: 81.67% | F1: 84.70%


Evaluating:   2%|▏         | 70/4182 [00:35<19:16,  3.56it/s]

Progress: 70/4182 | EM: 84.29% | F1: 86.89%


Evaluating:   2%|▏         | 80/4182 [00:40<33:21,  2.05it/s]

Progress: 80/4182 | EM: 85.00% | F1: 87.31%


Evaluating:   2%|▏         | 90/4182 [00:55<2:30:19,  2.20s/it]

Progress: 90/4182 | EM: 83.33% | F1: 86.13%


Evaluating:   2%|▏         | 100/4182 [01:08<1:57:54,  1.73s/it]

Progress: 100/4182 | EM: 78.00% | F1: 80.52%


Evaluating:   3%|▎         | 110/4182 [01:27<2:25:24,  2.14s/it]

Progress: 110/4182 | EM: 74.55% | F1: 77.19%


Evaluating:   3%|▎         | 120/4182 [01:35<59:00,  1.15it/s]  

Progress: 120/4182 | EM: 72.50% | F1: 74.98%


Evaluating:   3%|▎         | 130/4182 [01:49<1:36:22,  1.43s/it]

Progress: 130/4182 | EM: 69.23% | F1: 72.10%


Evaluating:   3%|▎         | 140/4182 [02:00<58:58,  1.14it/s]  

Progress: 140/4182 | EM: 68.57% | F1: 71.27%


Evaluating:   4%|▎         | 150/4182 [02:19<2:57:40,  2.64s/it]

Progress: 150/4182 | EM: 66.00% | F1: 68.63%


Evaluating:   4%|▍         | 160/4182 [02:27<59:02,  1.14it/s]  

Progress: 160/4182 | EM: 65.00% | F1: 67.50%


Evaluating:   4%|▍         | 170/4182 [02:32<26:05,  2.56it/s]

Progress: 170/4182 | EM: 64.71% | F1: 67.63%


Evaluating:   4%|▍         | 180/4182 [02:35<16:01,  4.16it/s]

Progress: 180/4182 | EM: 66.67% | F1: 69.43%


Evaluating:   5%|▍         | 190/4182 [02:39<19:13,  3.46it/s]

Progress: 190/4182 | EM: 67.37% | F1: 69.99%


Evaluating:   5%|▍         | 200/4182 [02:42<15:30,  4.28it/s]

Progress: 200/4182 | EM: 69.00% | F1: 71.49%


Evaluating:   5%|▌         | 210/4182 [02:46<39:05,  1.69it/s]

Progress: 210/4182 | EM: 70.48% | F1: 72.84%


Evaluating:   5%|▌         | 220/4182 [03:01<1:19:23,  1.20s/it]

Progress: 220/4182 | EM: 70.91% | F1: 73.17%


Evaluating:   5%|▌         | 230/4182 [03:12<1:25:12,  1.29s/it]

Progress: 230/4182 | EM: 71.30% | F1: 73.50%


Evaluating:   6%|▌         | 240/4182 [03:19<32:57,  1.99it/s]

Progress: 240/4182 | EM: 72.08% | F1: 74.19%


Evaluating:   6%|▌         | 250/4182 [03:27<41:24,  1.58it/s]

Progress: 250/4182 | EM: 71.60% | F1: 74.04%


Evaluating:   6%|▌         | 260/4182 [03:41<1:11:14,  1.09s/it]

Progress: 260/4182 | EM: 70.77% | F1: 73.21%


Evaluating:   6%|▋         | 270/4182 [03:45<26:30,  2.46it/s]

Progress: 270/4182 | EM: 70.37% | F1: 72.77%


Evaluating:   7%|▋         | 280/4182 [03:48<19:31,  3.33it/s]

Progress: 280/4182 | EM: 71.43% | F1: 73.75%


Evaluating:   7%|▋         | 290/4182 [03:52<27:46,  2.34it/s]

Progress: 290/4182 | EM: 71.72% | F1: 73.96%


Evaluating:   7%|▋         | 300/4182 [04:04<1:03:52,  1.01it/s]

Progress: 300/4182 | EM: 71.33% | F1: 73.53%


Evaluating:   7%|▋         | 310/4182 [04:18<2:29:54,  2.32s/it]

Progress: 310/4182 | EM: 70.97% | F1: 73.17%


Evaluating:   8%|▊         | 320/4182 [04:23<28:16,  2.28it/s]

Progress: 320/4182 | EM: 71.56% | F1: 73.79%


Evaluating:   8%|▊         | 330/4182 [04:35<30:13,  2.12it/s]

Progress: 330/4182 | EM: 71.52% | F1: 73.67%


Evaluating:   8%|▊         | 340/4182 [04:39<22:23,  2.86it/s]

Progress: 340/4182 | EM: 72.35% | F1: 74.45%


Evaluating:   8%|▊         | 350/4182 [04:42<16:28,  3.88it/s]

Progress: 350/4182 | EM: 73.14% | F1: 75.18%


Evaluating:   9%|▊         | 360/4182 [04:48<23:31,  2.71it/s]

Progress: 360/4182 | EM: 73.33% | F1: 75.31%


Evaluating:   9%|▉         | 370/4182 [04:54<28:18,  2.24it/s]

Progress: 370/4182 | EM: 73.78% | F1: 75.71%


Evaluating:   9%|▉         | 380/4182 [05:06<56:21,  1.12it/s]  

Progress: 380/4182 | EM: 74.21% | F1: 76.10%


Evaluating:   9%|▉         | 390/4182 [05:16<56:30,  1.12it/s]  

Progress: 390/4182 | EM: 74.62% | F1: 76.45%


Evaluating:  10%|▉         | 400/4182 [05:19<24:04,  2.62it/s]

Progress: 400/4182 | EM: 75.25% | F1: 77.04%


Evaluating:  10%|▉         | 410/4182 [05:23<23:05,  2.72it/s]

Progress: 410/4182 | EM: 75.85% | F1: 77.60%


Evaluating:  10%|█         | 420/4182 [05:34<55:50,  1.12it/s]  

Progress: 420/4182 | EM: 75.48% | F1: 77.24%


Evaluating:  10%|█         | 430/4182 [05:45<1:22:22,  1.32s/it]

Progress: 430/4182 | EM: 75.58% | F1: 77.31%


Evaluating:  11%|█         | 440/4182 [05:52<39:15,  1.59it/s]

Progress: 440/4182 | EM: 75.45% | F1: 77.20%


Evaluating:  11%|█         | 450/4182 [06:00<1:08:45,  1.11s/it]

Progress: 450/4182 | EM: 75.56% | F1: 77.27%


Evaluating:  11%|█         | 460/4182 [06:04<18:27,  3.36it/s]

Progress: 460/4182 | EM: 76.09% | F1: 77.76%


Evaluating:  11%|█         | 470/4182 [06:07<14:20,  4.32it/s]

Progress: 470/4182 | EM: 76.60% | F1: 78.23%


Evaluating:  11%|█▏        | 480/4182 [06:09<14:14,  4.33it/s]

Progress: 480/4182 | EM: 76.88% | F1: 78.48%


Evaluating:  12%|█▏        | 490/4182 [06:11<14:09,  4.35it/s]

Progress: 490/4182 | EM: 77.35% | F1: 78.92%


Evaluating:  12%|█▏        | 500/4182 [06:20<1:11:55,  1.17s/it]

Progress: 500/4182 | EM: 76.80% | F1: 78.82%


Evaluating:  12%|█▏        | 510/4182 [06:29<52:06,  1.17it/s]  

Progress: 510/4182 | EM: 77.06% | F1: 79.04%


Evaluating:  12%|█▏        | 520/4182 [06:39<54:13,  1.13it/s]  

Progress: 520/4182 | EM: 76.54% | F1: 78.58%


Evaluating:  13%|█▎        | 530/4182 [06:46<39:14,  1.55it/s]

Progress: 530/4182 | EM: 76.79% | F1: 78.79%


Evaluating:  13%|█▎        | 540/4182 [07:03<2:02:06,  2.01s/it]

Progress: 540/4182 | EM: 75.74% | F1: 77.82%


Evaluating:  13%|█▎        | 550/4182 [07:07<22:26,  2.70it/s]

Progress: 550/4182 | EM: 76.18% | F1: 78.23%


Evaluating:  13%|█▎        | 560/4182 [07:18<1:35:47,  1.59s/it]

Progress: 560/4182 | EM: 76.25% | F1: 78.26%


Evaluating:  14%|█▎        | 570/4182 [07:27<1:33:30,  1.55s/it]

Progress: 570/4182 | EM: 76.14% | F1: 78.11%


Evaluating:  14%|█▍        | 580/4182 [07:36<1:11:00,  1.18s/it]

Progress: 580/4182 | EM: 75.52% | F1: 77.57%


Evaluating:  14%|█▍        | 590/4182 [07:48<1:23:43,  1.40s/it]

Progress: 590/4182 | EM: 75.76% | F1: 77.78%


Evaluating:  14%|█▍        | 600/4182 [08:05<1:17:35,  1.30s/it]

Progress: 600/4182 | EM: 75.33% | F1: 77.32%


Evaluating:  15%|█▍        | 610/4182 [08:17<1:55:11,  1.94s/it]

Progress: 610/4182 | EM: 75.25% | F1: 77.20%


Evaluating:  15%|█▍        | 620/4182 [08:27<1:05:50,  1.11s/it]

Progress: 620/4182 | EM: 74.84% | F1: 76.81%


Evaluating:  15%|█▌        | 630/4182 [08:37<45:54,  1.29it/s]

Progress: 630/4182 | EM: 75.08% | F1: 77.03%


Evaluating:  15%|█▌        | 640/4182 [08:50<1:13:38,  1.25s/it]

Progress: 640/4182 | EM: 74.84% | F1: 76.76%


Evaluating:  16%|█▌        | 650/4182 [09:04<1:45:52,  1.80s/it]

Progress: 650/4182 | EM: 74.62% | F1: 76.53%


Evaluating:  16%|█▌        | 660/4182 [09:14<40:37,  1.45it/s]

Progress: 660/4182 | EM: 74.55% | F1: 76.55%


Evaluating:  16%|█▌        | 670/4182 [09:20<23:09,  2.53it/s]

Progress: 670/4182 | EM: 74.78% | F1: 76.75%


Evaluating:  16%|█▋        | 680/4182 [09:27<25:31,  2.29it/s]

Progress: 680/4182 | EM: 75.00% | F1: 76.95%


Evaluating:  16%|█▋        | 690/4182 [09:41<1:27:08,  1.50s/it]

Progress: 690/4182 | EM: 74.78% | F1: 76.71%


Evaluating:  17%|█▋        | 700/4182 [09:44<30:33,  1.90it/s]

Progress: 700/4182 | EM: 74.71% | F1: 76.72%


Evaluating:  17%|█▋        | 710/4182 [09:54<32:28,  1.78it/s]

Progress: 710/4182 | EM: 74.37% | F1: 76.34%


Evaluating:  17%|█▋        | 720/4182 [10:00<40:15,  1.43it/s]

Progress: 720/4182 | EM: 74.31% | F1: 76.25%


Evaluating:  17%|█▋        | 730/4182 [10:07<1:15:53,  1.32s/it]

Progress: 730/4182 | EM: 74.52% | F1: 76.45%


Evaluating:  18%|█▊        | 740/4182 [10:13<47:15,  1.21it/s]

Progress: 740/4182 | EM: 74.32% | F1: 76.23%


Evaluating:  18%|█▊        | 750/4182 [10:27<1:04:49,  1.13s/it]

Progress: 750/4182 | EM: 74.13% | F1: 76.05%


Evaluating:  18%|█▊        | 760/4182 [10:46<1:21:46,  1.43s/it]

Progress: 760/4182 | EM: 73.82% | F1: 75.73%


Evaluating:  18%|█▊        | 770/4182 [10:57<53:36,  1.06it/s]

Progress: 770/4182 | EM: 73.38% | F1: 75.28%


Evaluating:  19%|█▊        | 780/4182 [11:14<58:03,  1.02s/it]  

Progress: 780/4182 | EM: 72.95% | F1: 74.85%


Evaluating:  19%|█▉        | 790/4182 [11:17<14:28,  3.91it/s]

Progress: 790/4182 | EM: 73.04% | F1: 74.92%


Evaluating:  19%|█▉        | 800/4182 [11:19<13:08,  4.29it/s]

Progress: 800/4182 | EM: 73.38% | F1: 75.23%


Evaluating:  19%|█▉        | 810/4182 [11:21<13:04,  4.30it/s]

Progress: 810/4182 | EM: 73.70% | F1: 75.54%


Evaluating:  20%|█▉        | 820/4182 [11:24<12:52,  4.35it/s]

Progress: 820/4182 | EM: 74.02% | F1: 75.84%


Evaluating:  20%|█▉        | 830/4182 [11:43<2:19:54,  2.50s/it]

Progress: 830/4182 | EM: 73.61% | F1: 75.43%


Evaluating:  20%|██        | 840/4182 [11:55<1:32:26,  1.66s/it]

Progress: 840/4182 | EM: 73.45% | F1: 75.27%


Evaluating:  20%|██        | 850/4182 [12:10<1:12:34,  1.31s/it]

Progress: 850/4182 | EM: 73.29% | F1: 75.11%


Evaluating:  21%|██        | 860/4182 [12:35<2:32:07,  2.75s/it]

Progress: 860/4182 | EM: 72.91% | F1: 74.76%


Evaluating:  21%|██        | 870/4182 [12:46<1:05:34,  1.19s/it]

Progress: 870/4182 | EM: 72.76% | F1: 74.77%


Evaluating:  21%|██        | 880/4182 [12:56<1:08:08,  1.24s/it]

Progress: 880/4182 | EM: 72.95% | F1: 74.97%


Evaluating:  21%|██▏       | 890/4182 [13:07<51:29,  1.07it/s]

Progress: 890/4182 | EM: 73.03% | F1: 75.03%


Evaluating:  22%|██▏       | 900/4182 [13:22<1:53:08,  2.07s/it]

Progress: 900/4182 | EM: 72.89% | F1: 74.87%


Evaluating:  22%|██▏       | 910/4182 [13:31<1:09:14,  1.27s/it]

Progress: 910/4182 | EM: 72.64% | F1: 74.74%


Evaluating:  22%|██▏       | 920/4182 [13:34<14:15,  3.81it/s]

Progress: 920/4182 | EM: 72.93% | F1: 75.02%


Evaluating:  22%|██▏       | 930/4182 [13:36<12:24,  4.37it/s]

Progress: 930/4182 | EM: 73.23% | F1: 75.29%


Evaluating:  22%|██▏       | 940/4182 [13:38<12:20,  4.38it/s]

Progress: 940/4182 | EM: 73.51% | F1: 75.55%


Evaluating:  23%|██▎       | 950/4182 [13:43<26:57,  2.00it/s]

Progress: 950/4182 | EM: 73.58% | F1: 75.60%


Evaluating:  23%|██▎       | 960/4182 [13:46<14:46,  3.63it/s]

Progress: 960/4182 | EM: 73.85% | F1: 75.85%


Evaluating:  23%|██▎       | 970/4182 [13:50<14:45,  3.63it/s]

Progress: 970/4182 | EM: 74.12% | F1: 76.10%


Evaluating:  23%|██▎       | 980/4182 [13:53<13:52,  3.84it/s]

Progress: 980/4182 | EM: 74.39% | F1: 76.34%


Evaluating:  24%|██▎       | 990/4182 [14:02<1:20:23,  1.51s/it]

Progress: 990/4182 | EM: 74.34% | F1: 76.28%


Evaluating:  24%|██▍       | 1000/4182 [14:16<48:37,  1.09it/s]

Progress: 1000/4182 | EM: 74.20% | F1: 76.13%


Evaluating:  24%|██▍       | 1010/4182 [14:24<43:56,  1.20it/s]

Progress: 1010/4182 | EM: 73.76% | F1: 75.69%


Evaluating:  24%|██▍       | 1020/4182 [14:38<1:11:01,  1.35s/it]

Progress: 1020/4182 | EM: 73.73% | F1: 75.65%


Evaluating:  25%|██▍       | 1030/4182 [14:51<1:20:23,  1.53s/it]

Progress: 1030/4182 | EM: 73.30% | F1: 75.33%


Evaluating:  25%|██▍       | 1040/4182 [15:01<30:30,  1.72it/s]

Progress: 1040/4182 | EM: 73.37% | F1: 75.37%


Evaluating:  25%|██▌       | 1050/4182 [15:12<39:19,  1.33it/s]

Progress: 1050/4182 | EM: 73.24% | F1: 75.23%


Evaluating:  25%|██▌       | 1060/4182 [15:20<33:03,  1.57it/s]

Progress: 1060/4182 | EM: 73.21% | F1: 75.19%


Evaluating:  26%|██▌       | 1070/4182 [15:30<46:21,  1.12it/s]

Progress: 1070/4182 | EM: 73.08% | F1: 75.08%


Evaluating:  26%|██▌       | 1080/4182 [15:44<41:55,  1.23it/s]

Progress: 1080/4182 | EM: 72.87% | F1: 74.85%


Evaluating:  26%|██▌       | 1090/4182 [15:58<1:02:57,  1.22s/it]

Progress: 1090/4182 | EM: 72.75% | F1: 74.73%


Evaluating:  26%|██▋       | 1100/4182 [16:07<34:21,  1.49it/s]

Progress: 1100/4182 | EM: 72.91% | F1: 74.87%


Evaluating:  27%|██▋       | 1110/4182 [16:14<29:43,  1.72it/s]

Progress: 1110/4182 | EM: 72.79% | F1: 74.80%


Evaluating:  27%|██▋       | 1120/4182 [16:20<21:12,  2.41it/s]

Progress: 1120/4182 | EM: 72.77% | F1: 74.77%


Evaluating:  27%|██▋       | 1130/4182 [16:27<21:46,  2.34it/s]

Progress: 1130/4182 | EM: 72.83% | F1: 74.82%


Evaluating:  27%|██▋       | 1140/4182 [16:29<15:19,  3.31it/s]

Progress: 1140/4182 | EM: 73.07% | F1: 75.04%


Evaluating:  27%|██▋       | 1150/4182 [16:35<39:25,  1.28it/s]

Progress: 1150/4182 | EM: 73.13% | F1: 75.08%


Evaluating:  28%|██▊       | 1160/4182 [16:51<59:35,  1.18s/it]  

Progress: 1160/4182 | EM: 72.93% | F1: 74.90%


Evaluating:  28%|██▊       | 1170/4182 [17:04<1:29:15,  1.78s/it]

Progress: 1170/4182 | EM: 72.99% | F1: 74.95%


Evaluating:  28%|██▊       | 1180/4182 [17:14<45:10,  1.11it/s]

Progress: 1180/4182 | EM: 73.05% | F1: 74.99%


Evaluating:  28%|██▊       | 1190/4182 [17:32<51:26,  1.03s/it]  

Progress: 1190/4182 | EM: 72.77% | F1: 74.70%


Evaluating:  29%|██▊       | 1200/4182 [17:40<28:28,  1.75it/s]

Progress: 1200/4182 | EM: 72.58% | F1: 74.57%


Evaluating:  29%|██▉       | 1210/4182 [17:45<36:49,  1.35it/s]

Progress: 1210/4182 | EM: 72.56% | F1: 74.55%


Evaluating:  29%|██▉       | 1220/4182 [17:48<12:29,  3.95it/s]

Progress: 1220/4182 | EM: 72.79% | F1: 74.75%


Evaluating:  29%|██▉       | 1230/4182 [17:52<15:55,  3.09it/s]

Progress: 1230/4182 | EM: 72.93% | F1: 74.89%


Evaluating:  30%|██▉       | 1240/4182 [18:04<55:38,  1.13s/it]  

Progress: 1240/4182 | EM: 72.98% | F1: 74.94%


Evaluating:  30%|██▉       | 1250/4182 [18:13<56:33,  1.16s/it]  

Progress: 1250/4182 | EM: 72.88% | F1: 74.82%


Evaluating:  30%|███       | 1260/4182 [18:23<44:38,  1.09it/s]

Progress: 1260/4182 | EM: 72.70% | F1: 74.64%


Evaluating:  30%|███       | 1270/4182 [18:34<41:38,  1.17it/s]

Progress: 1270/4182 | EM: 72.68% | F1: 74.61%


Evaluating:  31%|███       | 1280/4182 [18:54<1:44:22,  2.16s/it]

Progress: 1280/4182 | EM: 72.58% | F1: 74.49%


Evaluating:  31%|███       | 1290/4182 [19:04<1:01:45,  1.28s/it]

Progress: 1290/4182 | EM: 72.56% | F1: 74.51%


Evaluating:  31%|███       | 1300/4182 [19:15<42:49,  1.12it/s]

Progress: 1300/4182 | EM: 72.62% | F1: 74.56%


Evaluating:  31%|███▏      | 1310/4182 [19:28<1:07:41,  1.41s/it]

Progress: 1310/4182 | EM: 72.44% | F1: 74.40%


Evaluating:  32%|███▏      | 1320/4182 [19:38<56:38,  1.19s/it]

Progress: 1320/4182 | EM: 72.50% | F1: 74.47%


Evaluating:  32%|███▏      | 1330/4182 [19:48<55:05,  1.16s/it]  

Progress: 1330/4182 | EM: 72.63% | F1: 74.58%


Evaluating:  32%|███▏      | 1340/4182 [19:55<19:23,  2.44it/s]

Progress: 1340/4182 | EM: 72.76% | F1: 74.70%


Evaluating:  32%|███▏      | 1350/4182 [19:58<16:16,  2.90it/s]

Progress: 1350/4182 | EM: 72.89% | F1: 74.82%


Evaluating:  33%|███▎      | 1360/4182 [20:02<15:16,  3.08it/s]

Progress: 1360/4182 | EM: 72.94% | F1: 74.86%


Evaluating:  33%|███▎      | 1370/4182 [20:06<15:08,  3.09it/s]

Progress: 1370/4182 | EM: 73.14% | F1: 75.05%


Evaluating:  33%|███▎      | 1380/4182 [20:10<14:55,  3.13it/s]

Progress: 1380/4182 | EM: 73.26% | F1: 75.17%


Evaluating:  33%|███▎      | 1390/4182 [20:13<13:56,  3.34it/s]

Progress: 1390/4182 | EM: 73.45% | F1: 75.35%


Evaluating:  33%|███▎      | 1400/4182 [20:17<13:20,  3.48it/s]

Progress: 1400/4182 | EM: 73.50% | F1: 75.38%


Evaluating:  34%|███▎      | 1410/4182 [20:19<10:55,  4.23it/s]

Progress: 1410/4182 | EM: 73.69% | F1: 75.56%


Evaluating:  34%|███▍      | 1420/4182 [20:21<10:42,  4.30it/s]

Progress: 1420/4182 | EM: 73.87% | F1: 75.73%


Evaluating:  34%|███▍      | 1430/4182 [20:24<10:29,  4.37it/s]

Progress: 1430/4182 | EM: 74.06% | F1: 75.90%


Evaluating:  34%|███▍      | 1440/4182 [20:28<18:54,  2.42it/s]

Progress: 1440/4182 | EM: 74.03% | F1: 75.92%


Evaluating:  35%|███▍      | 1450/4182 [20:31<11:54,  3.83it/s]

Progress: 1450/4182 | EM: 74.14% | F1: 76.03%


Evaluating:  35%|███▍      | 1460/4182 [20:33<10:26,  4.34it/s]

Progress: 1460/4182 | EM: 74.32% | F1: 76.19%


Evaluating:  35%|███▌      | 1470/4182 [20:36<10:28,  4.32it/s]

Progress: 1470/4182 | EM: 74.49% | F1: 76.35%


Evaluating:  35%|███▌      | 1480/4182 [20:39<18:37,  2.42it/s]

Progress: 1480/4182 | EM: 74.46% | F1: 76.36%


Evaluating:  36%|███▌      | 1490/4182 [20:50<32:13,  1.39it/s]

Progress: 1490/4182 | EM: 74.36% | F1: 76.30%


Evaluating:  36%|███▌      | 1500/4182 [21:01<42:15,  1.06it/s]

Progress: 1500/4182 | EM: 74.13% | F1: 76.12%


Evaluating:  36%|███▌      | 1510/4182 [21:05<15:14,  2.92it/s]

Progress: 1510/4182 | EM: 74.24% | F1: 76.21%


Evaluating:  36%|███▋      | 1520/4182 [21:11<28:41,  1.55it/s]

Progress: 1520/4182 | EM: 74.21% | F1: 76.17%


Evaluating:  37%|███▋      | 1530/4182 [21:24<28:37,  1.54it/s]

Progress: 1530/4182 | EM: 74.12% | F1: 76.08%


Evaluating:  37%|███▋      | 1540/4182 [21:30<24:45,  1.78it/s]

Progress: 1540/4182 | EM: 74.09% | F1: 76.04%


Evaluating:  37%|███▋      | 1550/4182 [21:38<24:10,  1.81it/s]

Progress: 1550/4182 | EM: 74.00% | F1: 75.97%


Evaluating:  37%|███▋      | 1560/4182 [21:41<16:44,  2.61it/s]

Progress: 1560/4182 | EM: 74.17% | F1: 76.13%


Evaluating:  38%|███▊      | 1570/4182 [21:44<10:16,  4.24it/s]

Progress: 1570/4182 | EM: 74.33% | F1: 76.28%


Evaluating:  38%|███▊      | 1580/4182 [21:46<10:03,  4.31it/s]

Progress: 1580/4182 | EM: 74.49% | F1: 76.43%


Evaluating:  38%|███▊      | 1590/4182 [21:48<09:54,  4.36it/s]

Progress: 1590/4182 | EM: 74.65% | F1: 76.58%


Evaluating:  38%|███▊      | 1600/4182 [21:51<15:14,  2.82it/s]

Progress: 1600/4182 | EM: 74.81% | F1: 76.72%


Evaluating:  38%|███▊      | 1610/4182 [22:04<1:10:46,  1.65s/it]

Progress: 1610/4182 | EM: 74.72% | F1: 76.67%


Evaluating:  39%|███▊      | 1620/4182 [22:13<37:19,  1.14it/s]

Progress: 1620/4182 | EM: 74.63% | F1: 76.57%


Evaluating:  39%|███▉      | 1630/4182 [22:20<25:08,  1.69it/s]

Progress: 1630/4182 | EM: 74.60% | F1: 76.53%


Evaluating:  39%|███▉      | 1640/4182 [22:24<17:31,  2.42it/s]

Progress: 1640/4182 | EM: 74.70% | F1: 76.62%


Evaluating:  39%|███▉      | 1650/4182 [22:39<47:26,  1.12s/it]

Progress: 1650/4182 | EM: 74.55% | F1: 76.54%


Evaluating:  40%|███▉      | 1660/4182 [22:49<39:24,  1.07it/s]

Progress: 1660/4182 | EM: 74.52% | F1: 76.50%


Evaluating:  40%|███▉      | 1670/4182 [22:56<28:19,  1.48it/s]

Progress: 1670/4182 | EM: 74.55% | F1: 76.53%


Evaluating:  40%|████      | 1680/4182 [23:14<1:13:36,  1.77s/it]

Progress: 1680/4182 | EM: 74.46% | F1: 76.44%


Evaluating:  40%|████      | 1690/4182 [23:25<57:46,  1.39s/it]  

Progress: 1690/4182 | EM: 74.44% | F1: 76.41%


Evaluating:  41%|████      | 1700/4182 [23:33<36:52,  1.12it/s]

Progress: 1700/4182 | EM: 74.29% | F1: 76.28%


Evaluating:  41%|████      | 1710/4182 [23:37<14:46,  2.79it/s]

Progress: 1710/4182 | EM: 74.39% | F1: 76.36%


Evaluating:  41%|████      | 1720/4182 [23:40<14:01,  2.93it/s]

Progress: 1720/4182 | EM: 74.53% | F1: 76.50%


Evaluating:  41%|████▏     | 1730/4182 [23:54<1:03:35,  1.56s/it]

Progress: 1730/4182 | EM: 74.57% | F1: 76.52%


Evaluating:  42%|████▏     | 1740/4182 [24:06<1:08:05,  1.67s/it]

Progress: 1740/4182 | EM: 74.54% | F1: 76.49%


Evaluating:  42%|████▏     | 1750/4182 [24:28<1:31:14,  2.25s/it]

Progress: 1750/4182 | EM: 74.51% | F1: 76.45%


Evaluating:  42%|████▏     | 1760/4182 [24:42<1:02:02,  1.54s/it]

Progress: 1760/4182 | EM: 74.55% | F1: 76.48%


Evaluating:  42%|████▏     | 1770/4182 [24:55<45:51,  1.14s/it]

Progress: 1770/4182 | EM: 74.35% | F1: 76.32%


Evaluating:  43%|████▎     | 1780/4182 [25:05<43:23,  1.08s/it]

Progress: 1780/4182 | EM: 74.44% | F1: 76.40%


Evaluating:  43%|████▎     | 1790/4182 [25:16<31:47,  1.25it/s]

Progress: 1790/4182 | EM: 74.53% | F1: 76.47%


Evaluating:  43%|████▎     | 1800/4182 [25:30<1:11:38,  1.80s/it]

Progress: 1800/4182 | EM: 74.56% | F1: 76.49%


Evaluating:  43%|████▎     | 1810/4182 [25:37<21:55,  1.80it/s]

Progress: 1810/4182 | EM: 74.42% | F1: 76.37%


Evaluating:  44%|████▎     | 1820/4182 [25:42<13:38,  2.88it/s]

Progress: 1820/4182 | EM: 74.51% | F1: 76.44%


Evaluating:  44%|████▍     | 1830/4182 [25:47<14:09,  2.77it/s]

Progress: 1830/4182 | EM: 74.48% | F1: 76.41%


Evaluating:  44%|████▍     | 1840/4182 [25:50<11:45,  3.32it/s]

Progress: 1840/4182 | EM: 74.62% | F1: 76.54%


Evaluating:  44%|████▍     | 1850/4182 [25:59<1:07:00,  1.72s/it]

Progress: 1850/4182 | EM: 74.65% | F1: 76.56%


Evaluating:  44%|████▍     | 1860/4182 [26:08<32:02,  1.21it/s]

Progress: 1860/4182 | EM: 74.78% | F1: 76.68%


Evaluating:  45%|████▍     | 1870/4182 [26:18<33:17,  1.16it/s]

Progress: 1870/4182 | EM: 74.76% | F1: 76.65%


Evaluating:  45%|████▍     | 1880/4182 [26:29<57:35,  1.50s/it]  

Progress: 1880/4182 | EM: 74.89% | F1: 76.77%


Evaluating:  45%|████▌     | 1890/4182 [26:38<39:39,  1.04s/it]

Progress: 1890/4182 | EM: 74.81% | F1: 76.69%


Evaluating:  45%|████▌     | 1900/4182 [26:52<40:24,  1.06s/it]

Progress: 1900/4182 | EM: 74.58% | F1: 76.47%


Evaluating:  46%|████▌     | 1910/4182 [27:07<44:40,  1.18s/it]

Progress: 1910/4182 | EM: 74.50% | F1: 76.39%


Evaluating:  46%|████▌     | 1920/4182 [27:18<33:20,  1.13it/s]

Progress: 1920/4182 | EM: 74.38% | F1: 76.25%


Evaluating:  46%|████▌     | 1930/4182 [27:26<26:36,  1.41it/s]

Progress: 1930/4182 | EM: 74.35% | F1: 76.22%


Evaluating:  46%|████▋     | 1940/4182 [27:33<17:19,  2.16it/s]

Progress: 1940/4182 | EM: 74.38% | F1: 76.24%


Evaluating:  47%|████▋     | 1950/4182 [27:36<13:46,  2.70it/s]

Progress: 1950/4182 | EM: 74.46% | F1: 76.31%


Evaluating:  47%|████▋     | 1960/4182 [27:40<11:23,  3.25it/s]

Progress: 1960/4182 | EM: 74.59% | F1: 76.43%


Evaluating:  47%|████▋     | 1970/4182 [27:43<15:36,  2.36it/s]

Progress: 1970/4182 | EM: 74.67% | F1: 76.50%


Evaluating:  47%|████▋     | 1980/4182 [27:51<15:47,  2.32it/s]

Progress: 1980/4182 | EM: 74.65% | F1: 76.47%


Evaluating:  48%|████▊     | 1990/4182 [27:54<09:41,  3.77it/s]

Progress: 1990/4182 | EM: 74.72% | F1: 76.54%


Evaluating:  48%|████▊     | 2000/4182 [27:56<09:27,  3.85it/s]

Progress: 2000/4182 | EM: 74.85% | F1: 76.66%


Evaluating:  48%|████▊     | 2010/4182 [27:59<13:05,  2.76it/s]

Progress: 2010/4182 | EM: 74.98% | F1: 76.77%


Evaluating:  48%|████▊     | 2020/4182 [28:07<24:25,  1.48it/s]

Progress: 2020/4182 | EM: 75.00% | F1: 76.79%


Evaluating:  49%|████▊     | 2030/4182 [28:10<15:51,  2.26it/s]

Progress: 2030/4182 | EM: 75.07% | F1: 76.87%


Evaluating:  49%|████▉     | 2040/4182 [28:16<15:10,  2.35it/s]

Progress: 2040/4182 | EM: 75.10% | F1: 76.88%


Evaluating:  49%|████▉     | 2050/4182 [28:24<24:16,  1.46it/s]

Progress: 2050/4182 | EM: 75.12% | F1: 76.90%


Evaluating:  49%|████▉     | 2060/4182 [28:33<27:40,  1.28it/s]

Progress: 2060/4182 | EM: 75.10% | F1: 76.88%


Evaluating:  49%|████▉     | 2070/4182 [28:43<47:20,  1.34s/it]

Progress: 2070/4182 | EM: 75.07% | F1: 76.88%


Evaluating:  50%|████▉     | 2080/4182 [28:53<31:52,  1.10it/s]

Progress: 2080/4182 | EM: 75.05% | F1: 76.85%


Evaluating:  50%|████▉     | 2090/4182 [29:05<1:02:27,  1.79s/it]

Progress: 2090/4182 | EM: 74.98% | F1: 76.78%


Evaluating:  50%|█████     | 2100/4182 [29:15<41:11,  1.19s/it]

Progress: 2100/4182 | EM: 74.86% | F1: 76.65%


Evaluating:  50%|█████     | 2110/4182 [29:23<29:47,  1.16it/s]

Progress: 2110/4182 | EM: 74.74% | F1: 76.56%


Evaluating:  51%|█████     | 2120/4182 [29:36<35:35,  1.04s/it]

Progress: 2120/4182 | EM: 74.58% | F1: 76.39%


Evaluating:  51%|█████     | 2130/4182 [29:40<14:06,  2.42it/s]

Progress: 2130/4182 | EM: 74.60% | F1: 76.40%


Evaluating:  51%|█████     | 2140/4182 [29:49<29:14,  1.16it/s]

Progress: 2140/4182 | EM: 74.58% | F1: 76.41%


Evaluating:  51%|█████▏    | 2150/4182 [29:52<09:51,  3.43it/s]

Progress: 2150/4182 | EM: 74.70% | F1: 76.52%


Evaluating:  52%|█████▏    | 2160/4182 [29:54<09:12,  3.66it/s]

Progress: 2160/4182 | EM: 74.81% | F1: 76.63%


Evaluating:  52%|█████▏    | 2170/4182 [29:57<09:04,  3.70it/s]

Progress: 2170/4182 | EM: 74.93% | F1: 76.73%


Evaluating:  52%|█████▏    | 2180/4182 [30:01<09:29,  3.51it/s]

Progress: 2180/4182 | EM: 74.91% | F1: 76.70%


Evaluating:  52%|█████▏    | 2190/4182 [30:03<07:41,  4.31it/s]

Progress: 2190/4182 | EM: 75.02% | F1: 76.81%


Evaluating:  53%|█████▎    | 2200/4182 [30:05<07:23,  4.47it/s]

Progress: 2200/4182 | EM: 75.14% | F1: 76.92%


Evaluating:  53%|█████▎    | 2210/4182 [30:08<07:32,  4.36it/s]

Progress: 2210/4182 | EM: 75.25% | F1: 77.02%


Evaluating:  53%|█████▎    | 2220/4182 [30:15<40:09,  1.23s/it]

Progress: 2220/4182 | EM: 75.23% | F1: 76.99%


Evaluating:  53%|█████▎    | 2230/4182 [30:19<09:50,  3.31it/s]

Progress: 2230/4182 | EM: 75.25% | F1: 77.01%


Evaluating:  54%|█████▎    | 2240/4182 [30:24<10:43,  3.02it/s]

Progress: 2240/4182 | EM: 75.27% | F1: 77.02%


Evaluating:  54%|█████▍    | 2250/4182 [30:27<07:58,  4.04it/s]

Progress: 2250/4182 | EM: 75.38% | F1: 77.13%


Evaluating:  54%|█████▍    | 2260/4182 [30:34<21:21,  1.50it/s]

Progress: 2260/4182 | EM: 75.40% | F1: 77.14%


Evaluating:  54%|█████▍    | 2270/4182 [30:36<07:43,  4.13it/s]

Progress: 2270/4182 | EM: 75.51% | F1: 77.24%


Evaluating:  55%|█████▍    | 2280/4182 [30:39<07:21,  4.31it/s]

Progress: 2280/4182 | EM: 75.61% | F1: 77.34%


Evaluating:  55%|█████▍    | 2290/4182 [30:41<07:24,  4.26it/s]

Progress: 2290/4182 | EM: 75.72% | F1: 77.44%


Evaluating:  55%|█████▍    | 2300/4182 [30:49<36:04,  1.15s/it]

Progress: 2300/4182 | EM: 75.70% | F1: 77.43%


Evaluating:  55%|█████▌    | 2310/4182 [31:01<26:14,  1.19it/s]

Progress: 2310/4182 | EM: 75.71% | F1: 77.44%


Evaluating:  55%|█████▌    | 2320/4182 [31:18<37:31,  1.21s/it]

Progress: 2320/4182 | EM: 75.65% | F1: 77.37%


Evaluating:  56%|█████▌    | 2330/4182 [31:29<55:22,  1.79s/it]

Progress: 2330/4182 | EM: 75.75% | F1: 77.47%


Evaluating:  56%|█████▌    | 2340/4182 [31:37<26:30,  1.16it/s]

Progress: 2340/4182 | EM: 75.60% | F1: 77.34%


Evaluating:  56%|█████▌    | 2350/4182 [31:51<31:33,  1.03s/it]

Progress: 2350/4182 | EM: 75.49% | F1: 77.22%


Evaluating:  56%|█████▋    | 2360/4182 [32:09<44:11,  1.46s/it]

Progress: 2360/4182 | EM: 75.38% | F1: 77.12%


Evaluating:  57%|█████▋    | 2370/4182 [32:32<1:18:17,  2.59s/it]

Progress: 2370/4182 | EM: 75.23% | F1: 76.97%


Evaluating:  57%|█████▋    | 2380/4182 [32:46<34:05,  1.14s/it]

Progress: 2380/4182 | EM: 75.08% | F1: 76.84%


Evaluating:  57%|█████▋    | 2390/4182 [32:57<21:45,  1.37it/s]

Progress: 2390/4182 | EM: 75.06% | F1: 76.81%


Evaluating:  57%|█████▋    | 2400/4182 [33:05<40:59,  1.38s/it]

Progress: 2400/4182 | EM: 75.00% | F1: 76.76%


Evaluating:  58%|█████▊    | 2410/4182 [33:10<14:29,  2.04it/s]

Progress: 2410/4182 | EM: 75.10% | F1: 76.85%


Evaluating:  58%|█████▊    | 2420/4182 [33:19<27:36,  1.06it/s]

Progress: 2420/4182 | EM: 75.04% | F1: 76.78%


Evaluating:  58%|█████▊    | 2430/4182 [33:26<14:54,  1.96it/s]

Progress: 2430/4182 | EM: 74.94% | F1: 76.70%


Evaluating:  58%|█████▊    | 2440/4182 [33:32<17:12,  1.69it/s]

Progress: 2440/4182 | EM: 74.96% | F1: 76.72%


Evaluating:  59%|█████▊    | 2450/4182 [33:35<08:08,  3.54it/s]

Progress: 2450/4182 | EM: 75.02% | F1: 76.77%


Evaluating:  59%|█████▉    | 2460/4182 [33:39<10:15,  2.80it/s]

Progress: 2460/4182 | EM: 75.04% | F1: 76.78%


Evaluating:  59%|█████▉    | 2470/4182 [33:41<06:49,  4.18it/s]

Progress: 2470/4182 | EM: 75.10% | F1: 76.84%


Evaluating:  59%|█████▉    | 2480/4182 [33:44<06:31,  4.34it/s]

Progress: 2480/4182 | EM: 75.20% | F1: 76.93%


Evaluating:  60%|█████▉    | 2490/4182 [33:46<06:33,  4.30it/s]

Progress: 2490/4182 | EM: 75.30% | F1: 77.02%


Evaluating:  60%|█████▉    | 2500/4182 [33:48<06:28,  4.33it/s]

Progress: 2500/4182 | EM: 75.40% | F1: 77.11%


Evaluating:  60%|██████    | 2510/4182 [34:09<1:15:53,  2.72s/it]

Progress: 2510/4182 | EM: 75.34% | F1: 77.05%


Evaluating:  60%|██████    | 2520/4182 [34:28<45:28,  1.64s/it]

Progress: 2520/4182 | EM: 75.04% | F1: 76.77%


Evaluating:  60%|██████    | 2530/4182 [34:46<38:10,  1.39s/it]

Progress: 2530/4182 | EM: 74.94% | F1: 76.67%


Evaluating:  61%|██████    | 2540/4182 [35:05<1:09:21,  2.53s/it]

Progress: 2540/4182 | EM: 74.84% | F1: 76.57%


Evaluating:  61%|██████    | 2550/4182 [35:15<34:12,  1.26s/it]

Progress: 2550/4182 | EM: 74.75% | F1: 76.47%


Evaluating:  61%|██████    | 2560/4182 [35:20<11:20,  2.38it/s]

Progress: 2560/4182 | EM: 74.77% | F1: 76.49%


Evaluating:  61%|██████▏   | 2570/4182 [35:33<29:42,  1.11s/it]

Progress: 2570/4182 | EM: 74.63% | F1: 76.37%


Evaluating:  62%|██████▏   | 2580/4182 [35:37<10:07,  2.64it/s]

Progress: 2580/4182 | EM: 74.69% | F1: 76.42%


Evaluating:  62%|██████▏   | 2590/4182 [35:42<12:16,  2.16it/s]

Progress: 2590/4182 | EM: 74.63% | F1: 76.37%


Evaluating:  62%|██████▏   | 2600/4182 [35:44<06:13,  4.24it/s]

Progress: 2600/4182 | EM: 74.69% | F1: 76.42%


Evaluating:  62%|██████▏   | 2610/4182 [35:46<05:57,  4.39it/s]

Progress: 2610/4182 | EM: 74.79% | F1: 76.51%


Evaluating:  63%|██████▎   | 2620/4182 [35:49<06:00,  4.34it/s]

Progress: 2620/4182 | EM: 74.85% | F1: 76.56%


Evaluating:  63%|██████▎   | 2630/4182 [35:56<36:57,  1.43s/it]

Progress: 2630/4182 | EM: 74.87% | F1: 76.58%


Evaluating:  63%|██████▎   | 2640/4182 [36:04<18:36,  1.38it/s]

Progress: 2640/4182 | EM: 74.89% | F1: 76.59%


Evaluating:  63%|██████▎   | 2650/4182 [36:11<12:01,  2.12it/s]

Progress: 2650/4182 | EM: 74.87% | F1: 76.58%


Evaluating:  64%|██████▎   | 2660/4182 [36:18<34:12,  1.35s/it]

Progress: 2660/4182 | EM: 74.89% | F1: 76.60%


Evaluating:  64%|██████▍   | 2670/4182 [36:25<29:22,  1.17s/it]

Progress: 2670/4182 | EM: 74.87% | F1: 76.61%


Evaluating:  64%|██████▍   | 2680/4182 [36:36<20:36,  1.21it/s]

Progress: 2680/4182 | EM: 74.93% | F1: 76.65%


Evaluating:  64%|██████▍   | 2690/4182 [36:45<18:20,  1.36it/s]

Progress: 2690/4182 | EM: 74.91% | F1: 76.63%


Evaluating:  65%|██████▍   | 2700/4182 [36:53<13:41,  1.80it/s]

Progress: 2700/4182 | EM: 74.85% | F1: 76.57%


Evaluating:  65%|██████▍   | 2710/4182 [37:04<21:20,  1.15it/s]

Progress: 2710/4182 | EM: 74.80% | F1: 76.54%


Evaluating:  65%|██████▌   | 2720/4182 [37:19<25:02,  1.03s/it]

Progress: 2720/4182 | EM: 74.71% | F1: 76.46%


Evaluating:  65%|██████▌   | 2730/4182 [37:40<1:01:36,  2.55s/it]

Progress: 2730/4182 | EM: 74.54% | F1: 76.29%


Evaluating:  66%|██████▌   | 2740/4182 [37:48<19:48,  1.21it/s]

Progress: 2740/4182 | EM: 74.53% | F1: 76.27%


Evaluating:  66%|██████▌   | 2750/4182 [37:56<16:21,  1.46it/s]

Progress: 2750/4182 | EM: 74.47% | F1: 76.22%


Evaluating:  66%|██████▌   | 2760/4182 [38:09<15:36,  1.52it/s]

Progress: 2760/4182 | EM: 74.42% | F1: 76.16%


Evaluating:  66%|██████▌   | 2770/4182 [38:15<10:00,  2.35it/s]

Progress: 2770/4182 | EM: 74.48% | F1: 76.21%


Evaluating:  66%|██████▋   | 2780/4182 [38:18<07:02,  3.32it/s]

Progress: 2780/4182 | EM: 74.50% | F1: 76.22%


Evaluating:  67%|██████▋   | 2790/4182 [38:22<07:20,  3.16it/s]

Progress: 2790/4182 | EM: 74.48% | F1: 76.20%


Evaluating:  67%|██████▋   | 2800/4182 [38:32<23:13,  1.01s/it]

Progress: 2800/4182 | EM: 74.50% | F1: 76.22%


Evaluating:  67%|██████▋   | 2810/4182 [38:41<11:19,  2.02it/s]

Progress: 2810/4182 | EM: 74.52% | F1: 76.24%


Evaluating:  67%|██████▋   | 2820/4182 [38:56<19:14,  1.18it/s]

Progress: 2820/4182 | EM: 74.50% | F1: 76.21%


Evaluating:  68%|██████▊   | 2830/4182 [39:05<15:30,  1.45it/s]

Progress: 2830/4182 | EM: 74.45% | F1: 76.16%


Evaluating:  68%|██████▊   | 2840/4182 [39:17<38:15,  1.71s/it]

Progress: 2840/4182 | EM: 74.44% | F1: 76.14%


Evaluating:  68%|██████▊   | 2850/4182 [39:26<17:40,  1.26it/s]

Progress: 2850/4182 | EM: 74.39% | F1: 76.10%


Evaluating:  68%|██████▊   | 2860/4182 [39:34<12:46,  1.72it/s]

Progress: 2860/4182 | EM: 74.44% | F1: 76.15%


Evaluating:  69%|██████▊   | 2870/4182 [39:43<13:43,  1.59it/s]

Progress: 2870/4182 | EM: 74.43% | F1: 76.13%


Evaluating:  69%|██████▉   | 2880/4182 [39:53<19:16,  1.13it/s]

Progress: 2880/4182 | EM: 74.34% | F1: 76.08%


Evaluating:  69%|██████▉   | 2890/4182 [40:07<41:56,  1.95s/it]

Progress: 2890/4182 | EM: 74.36% | F1: 76.10%


Evaluating:  69%|██████▉   | 2900/4182 [40:16<17:12,  1.24it/s]

Progress: 2900/4182 | EM: 74.38% | F1: 76.13%


Evaluating:  70%|██████▉   | 2910/4182 [40:31<24:04,  1.14s/it]

Progress: 2910/4182 | EM: 74.43% | F1: 76.17%


Evaluating:  70%|██████▉   | 2920/4182 [40:47<38:07,  1.81s/it]

Progress: 2920/4182 | EM: 74.32% | F1: 76.05%


Evaluating:  70%|███████   | 2930/4182 [40:57<23:08,  1.11s/it]

Progress: 2930/4182 | EM: 74.30% | F1: 76.04%


Evaluating:  70%|███████   | 2940/4182 [41:07<12:34,  1.65it/s]

Progress: 2940/4182 | EM: 74.29% | F1: 76.02%


Evaluating:  71%|███████   | 2950/4182 [41:10<06:44,  3.05it/s]

Progress: 2950/4182 | EM: 74.34% | F1: 76.07%


Evaluating:  71%|███████   | 2960/4182 [41:19<21:45,  1.07s/it]

Progress: 2960/4182 | EM: 74.29% | F1: 76.08%


Evaluating:  71%|███████   | 2970/4182 [41:25<14:41,  1.37it/s]

Progress: 2970/4182 | EM: 74.34% | F1: 76.13%


Evaluating:  71%|███████▏  | 2980/4182 [41:32<13:50,  1.45it/s]

Progress: 2980/4182 | EM: 74.30% | F1: 76.08%


Evaluating:  71%|███████▏  | 2990/4182 [41:38<12:55,  1.54it/s]

Progress: 2990/4182 | EM: 74.25% | F1: 76.03%


Evaluating:  72%|███████▏  | 3000/4182 [41:51<23:02,  1.17s/it]

Progress: 3000/4182 | EM: 74.20% | F1: 75.98%


Evaluating:  72%|███████▏  | 3010/4182 [42:04<27:37,  1.41s/it]

Progress: 3010/4182 | EM: 74.15% | F1: 75.92%


Evaluating:  72%|███████▏  | 3020/4182 [42:21<24:06,  1.24s/it]

Progress: 3020/4182 | EM: 74.11% | F1: 75.88%


Evaluating:  72%|███████▏  | 3030/4182 [42:38<36:54,  1.92s/it]

Progress: 3030/4182 | EM: 74.06% | F1: 75.84%


Evaluating:  73%|███████▎  | 3040/4182 [42:46<08:11,  2.32it/s]

Progress: 3040/4182 | EM: 73.98% | F1: 75.75%


Evaluating:  73%|███████▎  | 3050/4182 [42:48<04:32,  4.15it/s]

Progress: 3050/4182 | EM: 74.07% | F1: 75.83%


Evaluating:  73%|███████▎  | 3060/4182 [42:51<04:21,  4.29it/s]

Progress: 3060/4182 | EM: 74.15% | F1: 75.91%


Evaluating:  73%|███████▎  | 3070/4182 [42:53<04:18,  4.30it/s]

Progress: 3070/4182 | EM: 74.23% | F1: 75.99%


Evaluating:  74%|███████▎  | 3080/4182 [43:05<44:36,  2.43s/it]

Progress: 3080/4182 | EM: 74.22% | F1: 75.97%


Evaluating:  74%|███████▍  | 3090/4182 [43:15<16:33,  1.10it/s]

Progress: 3090/4182 | EM: 74.24% | F1: 75.99%


Evaluating:  74%|███████▍  | 3100/4182 [43:32<28:31,  1.58s/it]

Progress: 3100/4182 | EM: 74.16% | F1: 75.93%


Evaluating:  74%|███████▍  | 3110/4182 [43:53<43:42,  2.45s/it]

Progress: 3110/4182 | EM: 74.08% | F1: 75.84%


Evaluating:  75%|███████▍  | 3120/4182 [44:08<21:18,  1.20s/it]

Progress: 3120/4182 | EM: 74.01% | F1: 75.79%


Evaluating:  75%|███████▍  | 3130/4182 [44:19<14:52,  1.18it/s]

Progress: 3130/4182 | EM: 73.99% | F1: 75.78%


Evaluating:  75%|███████▌  | 3140/4182 [44:31<17:06,  1.02it/s]

Progress: 3140/4182 | EM: 73.95% | F1: 75.74%


Evaluating:  75%|███████▌  | 3150/4182 [44:47<30:09,  1.75s/it]

Progress: 3150/4182 | EM: 73.87% | F1: 75.67%


Evaluating:  76%|███████▌  | 3160/4182 [44:59<17:20,  1.02s/it]

Progress: 3160/4182 | EM: 73.83% | F1: 75.63%


Evaluating:  76%|███████▌  | 3170/4182 [45:12<14:29,  1.16it/s]

Progress: 3170/4182 | EM: 73.82% | F1: 75.62%


Evaluating:  76%|███████▌  | 3180/4182 [45:23<14:04,  1.19it/s]

Progress: 3180/4182 | EM: 73.84% | F1: 75.64%


Evaluating:  76%|███████▋  | 3190/4182 [45:32<27:53,  1.69s/it]

Progress: 3190/4182 | EM: 73.89% | F1: 75.69%


Evaluating:  77%|███████▋  | 3200/4182 [45:42<21:10,  1.29s/it]

Progress: 3200/4182 | EM: 73.97% | F1: 75.76%


Evaluating:  77%|███████▋  | 3210/4182 [45:52<13:42,  1.18it/s]

Progress: 3210/4182 | EM: 73.89% | F1: 75.69%


Evaluating:  77%|███████▋  | 3220/4182 [46:02<17:31,  1.09s/it]

Progress: 3220/4182 | EM: 73.76% | F1: 75.56%


Evaluating:  77%|███████▋  | 3230/4182 [46:11<12:32,  1.27it/s]

Progress: 3230/4182 | EM: 73.75% | F1: 75.54%


Evaluating:  77%|███████▋  | 3240/4182 [46:19<13:31,  1.16it/s]

Progress: 3240/4182 | EM: 73.58% | F1: 75.40%


Evaluating:  78%|███████▊  | 3250/4182 [46:39<41:13,  2.65s/it]

Progress: 3250/4182 | EM: 73.48% | F1: 75.30%


Evaluating:  78%|███████▊  | 3260/4182 [46:57<26:59,  1.76s/it]

Progress: 3260/4182 | EM: 73.40% | F1: 75.23%


Evaluating:  78%|███████▊  | 3270/4182 [47:11<15:00,  1.01it/s]

Progress: 3270/4182 | EM: 73.39% | F1: 75.21%


Evaluating:  78%|███████▊  | 3280/4182 [47:27<20:05,  1.34s/it]

Progress: 3280/4182 | EM: 73.35% | F1: 75.17%


Evaluating:  79%|███████▊  | 3290/4182 [47:34<09:29,  1.57it/s]

Progress: 3290/4182 | EM: 73.31% | F1: 75.13%


Evaluating:  79%|███████▉  | 3300/4182 [47:37<06:08,  2.40it/s]

Progress: 3300/4182 | EM: 73.33% | F1: 75.15%


Evaluating:  79%|███████▉  | 3310/4182 [47:40<03:47,  3.83it/s]

Progress: 3310/4182 | EM: 73.41% | F1: 75.23%


Evaluating:  79%|███████▉  | 3320/4182 [47:46<05:25,  2.65it/s]

Progress: 3320/4182 | EM: 73.46% | F1: 75.27%


Evaluating:  80%|███████▉  | 3330/4182 [48:04<26:33,  1.87s/it]

Progress: 3330/4182 | EM: 73.45% | F1: 75.26%


Evaluating:  80%|███████▉  | 3340/4182 [48:23<21:30,  1.53s/it]

Progress: 3340/4182 | EM: 73.50% | F1: 75.30%


Evaluating:  80%|████████  | 3350/4182 [48:40<15:12,  1.10s/it]

Progress: 3350/4182 | EM: 73.37% | F1: 75.17%


Evaluating:  80%|████████  | 3360/4182 [48:58<18:14,  1.33s/it]

Progress: 3360/4182 | EM: 73.33% | F1: 75.13%


Evaluating:  81%|████████  | 3370/4182 [49:09<20:43,  1.53s/it]

Progress: 3370/4182 | EM: 73.32% | F1: 75.13%


Evaluating:  81%|████████  | 3380/4182 [49:16<11:24,  1.17it/s]

Progress: 3380/4182 | EM: 73.34% | F1: 75.15%


Evaluating:  81%|████████  | 3390/4182 [49:31<21:37,  1.64s/it]

Progress: 3390/4182 | EM: 73.30% | F1: 75.11%


Evaluating:  81%|████████▏ | 3400/4182 [49:41<13:23,  1.03s/it]

Progress: 3400/4182 | EM: 73.35% | F1: 75.16%


Evaluating:  82%|████████▏ | 3410/4182 [49:52<23:06,  1.80s/it]

Progress: 3410/4182 | EM: 73.26% | F1: 75.11%


Evaluating:  82%|████████▏ | 3420/4182 [49:58<09:51,  1.29it/s]

Progress: 3420/4182 | EM: 73.33% | F1: 75.19%


Evaluating:  82%|████████▏ | 3430/4182 [50:08<11:15,  1.11it/s]

Progress: 3430/4182 | EM: 73.35% | F1: 75.21%


Evaluating:  82%|████████▏ | 3440/4182 [50:15<11:10,  1.11it/s]

Progress: 3440/4182 | EM: 73.34% | F1: 75.20%


Evaluating:  82%|████████▏ | 3450/4182 [50:26<14:29,  1.19s/it]

Progress: 3450/4182 | EM: 73.36% | F1: 75.22%


Evaluating:  83%|████████▎ | 3460/4182 [50:32<05:21,  2.25it/s]

Progress: 3460/4182 | EM: 73.38% | F1: 75.23%


Evaluating:  83%|████████▎ | 3470/4182 [50:52<24:33,  2.07s/it]

Progress: 3470/4182 | EM: 73.34% | F1: 75.19%


Evaluating:  83%|████████▎ | 3480/4182 [50:59<10:44,  1.09it/s]

Progress: 3480/4182 | EM: 73.36% | F1: 75.21%


Evaluating:  83%|████████▎ | 3490/4182 [51:04<09:00,  1.28it/s]

Progress: 3490/4182 | EM: 73.30% | F1: 75.15%


Evaluating:  84%|████████▎ | 3500/4182 [51:10<03:57,  2.87it/s]

Progress: 3500/4182 | EM: 73.34% | F1: 75.20%


Evaluating:  84%|████████▍ | 3510/4182 [51:12<02:46,  4.03it/s]

Progress: 3510/4182 | EM: 73.42% | F1: 75.27%


Evaluating:  84%|████████▍ | 3520/4182 [51:15<02:44,  4.03it/s]

Progress: 3520/4182 | EM: 73.49% | F1: 75.34%


Evaluating:  84%|████████▍ | 3530/4182 [51:18<04:07,  2.64it/s]

Progress: 3530/4182 | EM: 73.51% | F1: 75.35%


Evaluating:  85%|████████▍ | 3540/4182 [51:24<04:41,  2.28it/s]

Progress: 3540/4182 | EM: 73.53% | F1: 75.39%


Evaluating:  85%|████████▍ | 3550/4182 [51:30<04:47,  2.20it/s]

Progress: 3550/4182 | EM: 73.55% | F1: 75.41%


Evaluating:  85%|████████▌ | 3560/4182 [51:34<03:48,  2.72it/s]

Progress: 3560/4182 | EM: 73.62% | F1: 75.48%


Evaluating:  85%|████████▌ | 3570/4182 [51:38<04:23,  2.32it/s]

Progress: 3570/4182 | EM: 73.64% | F1: 75.50%


Evaluating:  86%|████████▌ | 3580/4182 [51:40<02:23,  4.19it/s]

Progress: 3580/4182 | EM: 73.69% | F1: 75.54%


Evaluating:  86%|████████▌ | 3590/4182 [51:42<02:16,  4.35it/s]

Progress: 3590/4182 | EM: 73.76% | F1: 75.61%


Evaluating:  86%|████████▌ | 3600/4182 [51:45<02:13,  4.35it/s]

Progress: 3600/4182 | EM: 73.83% | F1: 75.68%


Evaluating:  86%|████████▋ | 3610/4182 [51:49<06:06,  1.56it/s]

Progress: 3610/4182 | EM: 73.88% | F1: 75.72%


Evaluating:  87%|████████▋ | 3620/4182 [52:01<09:04,  1.03it/s]

Progress: 3620/4182 | EM: 73.81% | F1: 75.67%


Evaluating:  87%|████████▋ | 3630/4182 [52:16<13:50,  1.51s/it]

Progress: 3630/4182 | EM: 73.80% | F1: 75.66%


Evaluating:  87%|████████▋ | 3640/4182 [52:27<08:16,  1.09it/s]

Progress: 3640/4182 | EM: 73.87% | F1: 75.73%


Evaluating:  87%|████████▋ | 3650/4182 [52:41<07:40,  1.16it/s]

Progress: 3650/4182 | EM: 73.89% | F1: 75.74%


Evaluating:  88%|████████▊ | 3660/4182 [52:56<16:50,  1.94s/it]

Progress: 3660/4182 | EM: 73.85% | F1: 75.71%


Evaluating:  88%|████████▊ | 3670/4182 [53:08<12:41,  1.49s/it]

Progress: 3670/4182 | EM: 73.81% | F1: 75.67%


Evaluating:  88%|████████▊ | 3680/4182 [53:24<08:23,  1.00s/it]

Progress: 3680/4182 | EM: 73.75% | F1: 75.61%


Evaluating:  88%|████████▊ | 3690/4182 [53:44<15:15,  1.86s/it]

Progress: 3690/4182 | EM: 73.74% | F1: 75.60%


Evaluating:  88%|████████▊ | 3700/4182 [53:51<05:20,  1.50it/s]

Progress: 3700/4182 | EM: 73.76% | F1: 75.64%


Evaluating:  89%|████████▊ | 3710/4182 [53:55<04:39,  1.69it/s]

Progress: 3710/4182 | EM: 73.83% | F1: 75.70%


Evaluating:  89%|████████▉ | 3720/4182 [53:58<01:53,  4.06it/s]

Progress: 3720/4182 | EM: 73.90% | F1: 75.77%


Evaluating:  89%|████████▉ | 3730/4182 [54:03<07:49,  1.04s/it]

Progress: 3730/4182 | EM: 73.94% | F1: 75.82%


Evaluating:  89%|████████▉ | 3740/4182 [54:24<17:33,  2.38s/it]

Progress: 3740/4182 | EM: 73.85% | F1: 75.73%


Evaluating:  90%|████████▉ | 3750/4182 [54:38<16:58,  2.36s/it]

Progress: 3750/4182 | EM: 73.73% | F1: 75.61%


Evaluating:  90%|████████▉ | 3760/4182 [54:50<08:15,  1.17s/it]

Progress: 3760/4182 | EM: 73.64% | F1: 75.52%


Evaluating:  90%|█████████ | 3770/4182 [55:04<11:32,  1.68s/it]

Progress: 3770/4182 | EM: 73.55% | F1: 75.44%


Evaluating:  90%|█████████ | 3780/4182 [55:14<08:23,  1.25s/it]

Progress: 3780/4182 | EM: 73.47% | F1: 75.38%


Evaluating:  91%|█████████ | 3790/4182 [55:23<09:01,  1.38s/it]

Progress: 3790/4182 | EM: 73.51% | F1: 75.41%


Evaluating:  91%|█████████ | 3800/4182 [55:41<11:22,  1.79s/it]

Progress: 3800/4182 | EM: 73.47% | F1: 75.38%


Evaluating:  91%|█████████ | 3810/4182 [55:51<07:03,  1.14s/it]

Progress: 3810/4182 | EM: 73.46% | F1: 75.37%


Evaluating:  91%|█████████▏| 3820/4182 [55:54<01:54,  3.15it/s]

Progress: 3820/4182 | EM: 73.48% | F1: 75.38%


Evaluating:  92%|█████████▏| 3830/4182 [55:57<01:22,  4.27it/s]

Progress: 3830/4182 | EM: 73.55% | F1: 75.44%


Evaluating:  92%|█████████▏| 3840/4182 [55:59<01:18,  4.35it/s]

Progress: 3840/4182 | EM: 73.62% | F1: 75.51%


Evaluating:  92%|█████████▏| 3850/4182 [56:01<01:16,  4.33it/s]

Progress: 3850/4182 | EM: 73.69% | F1: 75.57%


Evaluating:  92%|█████████▏| 3860/4182 [56:10<05:58,  1.11s/it]

Progress: 3860/4182 | EM: 73.68% | F1: 75.56%


Evaluating:  93%|█████████▎| 3870/4182 [56:17<05:10,  1.00it/s]

Progress: 3870/4182 | EM: 73.70% | F1: 75.57%


Evaluating:  93%|█████████▎| 3880/4182 [56:21<01:58,  2.54it/s]

Progress: 3880/4182 | EM: 73.76% | F1: 75.63%


Evaluating:  93%|█████████▎| 3890/4182 [56:28<06:45,  1.39s/it]

Progress: 3890/4182 | EM: 73.78% | F1: 75.64%


Evaluating:  93%|█████████▎| 3900/4182 [56:38<06:13,  1.32s/it]

Progress: 3900/4182 | EM: 73.74% | F1: 75.62%


Evaluating:  93%|█████████▎| 3910/4182 [56:44<01:55,  2.36it/s]

Progress: 3910/4182 | EM: 73.79% | F1: 75.66%


Evaluating:  94%|█████████▎| 3920/4182 [56:50<01:48,  2.41it/s]

Progress: 3920/4182 | EM: 73.80% | F1: 75.68%


Evaluating:  94%|█████████▍| 3930/4182 [56:53<01:20,  3.12it/s]

Progress: 3930/4182 | EM: 73.87% | F1: 75.74%


Evaluating:  94%|█████████▍| 3940/4182 [56:57<01:47,  2.25it/s]

Progress: 3940/4182 | EM: 73.86% | F1: 75.76%


Evaluating:  94%|█████████▍| 3950/4182 [57:03<01:33,  2.49it/s]

Progress: 3950/4182 | EM: 73.90% | F1: 75.80%


Evaluating:  95%|█████████▍| 3960/4182 [57:09<01:30,  2.46it/s]

Progress: 3960/4182 | EM: 73.94% | F1: 75.83%


Evaluating:  95%|█████████▍| 3970/4182 [57:12<01:07,  3.13it/s]

Progress: 3970/4182 | EM: 73.98% | F1: 75.87%


Evaluating:  95%|█████████▌| 3980/4182 [57:23<01:49,  1.84it/s]

Progress: 3980/4182 | EM: 73.97% | F1: 75.86%


Evaluating:  95%|█████████▌| 3990/4182 [57:25<00:49,  3.87it/s]

Progress: 3990/4182 | EM: 74.01% | F1: 75.90%


Evaluating:  96%|█████████▌| 4000/4182 [57:31<01:00,  3.02it/s]

Progress: 4000/4182 | EM: 74.05% | F1: 75.94%


Evaluating:  96%|█████████▌| 4010/4182 [57:34<00:42,  4.01it/s]

Progress: 4010/4182 | EM: 74.06% | F1: 75.95%


Evaluating:  96%|█████████▌| 4020/4182 [57:41<02:56,  1.09s/it]

Progress: 4020/4182 | EM: 74.08% | F1: 75.96%


Evaluating:  96%|█████████▋| 4030/4182 [57:56<03:36,  1.43s/it]

Progress: 4030/4182 | EM: 74.12% | F1: 76.00%


Evaluating:  97%|█████████▋| 4040/4182 [58:07<02:29,  1.05s/it]

Progress: 4040/4182 | EM: 74.11% | F1: 75.98%


Evaluating:  97%|█████████▋| 4050/4182 [58:24<03:38,  1.66s/it]

Progress: 4050/4182 | EM: 74.02% | F1: 75.89%


Evaluating:  97%|█████████▋| 4060/4182 [58:40<02:13,  1.09s/it]

Progress: 4060/4182 | EM: 74.01% | F1: 75.88%


Evaluating:  97%|█████████▋| 4070/4182 [58:48<01:10,  1.58it/s]

Progress: 4070/4182 | EM: 74.03% | F1: 75.90%


Evaluating:  98%|█████████▊| 4080/4182 [58:55<01:22,  1.24it/s]

Progress: 4080/4182 | EM: 74.04% | F1: 75.93%


Evaluating:  98%|█████████▊| 4090/4182 [58:59<00:33,  2.76it/s]

Progress: 4090/4182 | EM: 74.08% | F1: 75.96%


Evaluating:  98%|█████████▊| 4100/4182 [59:03<00:30,  2.69it/s]

Progress: 4100/4182 | EM: 74.12% | F1: 76.00%


Evaluating:  98%|█████████▊| 4110/4182 [59:15<01:02,  1.15it/s]

Progress: 4110/4182 | EM: 74.06% | F1: 75.96%


Evaluating:  99%|█████████▊| 4120/4182 [59:25<01:06,  1.07s/it]

Progress: 4120/4182 | EM: 74.05% | F1: 75.96%


Evaluating:  99%|█████████▉| 4130/4182 [59:29<00:22,  2.26it/s]

Progress: 4130/4182 | EM: 74.04% | F1: 75.95%


Evaluating:  99%|█████████▉| 4140/4182 [59:33<00:13,  3.20it/s]

Progress: 4140/4182 | EM: 74.11% | F1: 76.00%


Evaluating:  99%|█████████▉| 4150/4182 [59:39<00:23,  1.34it/s]

Progress: 4150/4182 | EM: 74.07% | F1: 75.97%


Evaluating:  99%|█████████▉| 4160/4182 [59:48<00:24,  1.13s/it]

Progress: 4160/4182 | EM: 74.06% | F1: 75.96%


Evaluating: 100%|█████████▉| 4170/4182 [1:00:00<00:10,  1.19it/s]

Progress: 4170/4182 | EM: 74.08% | F1: 75.97%


Evaluating: 100%|█████████▉| 4180/4182 [1:00:08<00:02,  1.49s/it]

Progress: 4180/4182 | EM: 74.04% | F1: 75.94%


Evaluating: 100%|██████████| 4182/4182 [1:00:09<00:00,  1.16it/s]


EVALUATION RESULTS
Model: ./cuad_finetuned_llama3_2_3b
Samples Evaluated: 4182
Exact Match: 74.06%
F1 Score: 75.95%
Evaluation Time: 60.15 minutes
Speed: 1.16 samples/second
📁 Predictions saved to: ./evaluation_results/predictions_cuad_finetuned_llama3_2_3b_20250608_053951.json
📁 Detailed results saved to: ./evaluation_results/results_cuad_finetuned_llama3_2_3b_20250608_053951.json

📊 COMPARISON SUMMARY
Model           Exact Match     F1 Score        Samples    Time (min)  
--------------------------------------------------------------------------------
Original        27.14           32.24           4182       167.00      
Fine-tuned      74.06           75.95           4182       60.15       

🎯 IMPROVEMENT:
Exact Match: +46.92 percentage points
F1 Score: +43.71 percentage points

✅ Evaluation completed! Results saved in: ./evaluation_results





In [None]:
!zip -r evaluation_results.zip /content/evaluation_results

updating: content/evaluation_results/ (stored 0%)
  adding: content/evaluation_results/results_cuad_finetuned_llama3_2_3b_20250608_053951.json (deflated 93%)
  adding: content/evaluation_results/predictions_Llama-3.2-3B-Instruct-bnb-4bit_20250608_043934.json (deflated 85%)
  adding: content/evaluation_results/results_Llama-3.2-3B-Instruct-bnb-4bit_20250608_043934.json (deflated 92%)
  adding: content/evaluation_results/predictions_cuad_finetuned_llama3_2_3b_20250608_053951.json (deflated 91%)


In [None]:
from google.colab import files
files.download('evaluation_results.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>