## Evaluating AbLMs on test sets

In [None]:
from datasets import load_dataset
from transformers import (
    EsmTokenizer,
    EsmForMaskedLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
import torch

In [None]:
# Step 1: Load and prepare your dataset
# This path specifies the test dataset.
# You can change it to evaluate the model's Cross-Entropy (CE) loss on other available test sets.
dataset = load_dataset(
    'csv',
    data_files={'test': './data/test/test_dataset.csv'},
)

# Step 2: Load the tokenizer and model
# NOTE(review): the tokenizer is loaded from the public 8M ESM-2 checkpoint while the
# model weights come from the local checkpoint below — confirm both share one vocabulary.
tokenizer = EsmTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")

# To evaluate a specific model checkpoint, **change the path below**.
# Its Cross-Entropy (CE) loss is then computed on the test dataset defined above.
model_path = './01all_esm_models/deepspeed/esm/all_checkpoints_4good/m_150M_full_batch_128_2025-02-11/checkpoint-500000'
model = EsmForMaskedLM.from_pretrained(model_path)

# Step 3: Define parameters consistent with training
MAX_LEN = 320  # or use the same value as your training config (e.g., train_config["max_length"])
SEPARATOR = "<cls><cls>"  # or use your training separator, e.g., train_config["separator_token"]

# Step 4: Tokenize the dataset using a function similar to your training preprocess
def preprocess_function(example):
    """Tokenize one paired heavy/light chain record for evaluation.

    Reads ``example['sequence_aa_heavy']`` and ``example['sequence_aa_light']``,
    joins them with the module-level ``SEPARATOR`` and tokenizes the result,
    padding/truncating to ``MAX_LEN``. Returns the tokenizer output with an
    added ``'special_tokens_mask'`` entry.
    """
    # Heavy chain first, then the separator, then the light chain.
    paired_chains = SEPARATOR.join(
        (example['sequence_aa_heavy'], example['sequence_aa_light'])
    )

    # Fixed-length encoding; settings are meant to mirror the training setup.
    encoding = tokenizer(
        paired_chains,
        add_special_tokens=True,
        max_length=MAX_LEN,
        truncation=True,
        padding='max_length',
    )

    # The mask is derived from the final ids (which already contain the special
    # tokens), rather than being requested from the tokenizer call itself.
    final_ids = encoding['input_ids']
    encoding['special_tokens_mask'] = tokenizer.get_special_tokens_mask(
        final_ids,
        already_has_special_tokens=True,
    )
    return encoding

# Tokenize the test split one example at a time (preprocess_function is not batched).
tokenized_datasets = dataset.map(preprocess_function, batched=False)
eval_dataset = tokenized_datasets['test']

# Step 5: Prepare the data collator (using the one from training)
# With mlm=True the collator randomly masks tokens, so the reported loss depends on
# the random state at evaluation time.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,  # Use the same probability as in training
)

# Step 6: Set up training/evaluation arguments
# No evaluation strategy is passed: its default is "no", and evaluation is triggered
# explicitly through trainer.evaluate() below. This also avoids the deprecated
# `evaluation_strategy` keyword (renamed `eval_strategy` in recent transformers).
training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=32,
    logging_dir='./logs',
    do_eval=True,
    report_to="none",  # Explicitly disable W&B logging
)

# Step 7: Initialize the Trainer with the data collator and model
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Step 8: Run evaluation (just once)
eval_results = trainer.evaluate()

# Grab the single eval_loss value: the masked-LM cross-entropy reported by the Trainer.
loss = eval_results['eval_loss']
print(f"Cross‑Entropy Loss (eval_loss): {loss:.4f}")

