In [6]:
from datasets import load_dataset

# Expected CSV schema: two columns named "id" and "sequence".
# Each path may be a local file or a remote URI such as
# "s3://my-bucket/antibody/train.csv"; the "validation" entry is optional.
dataset = load_dataset(
    path="csv",
    data_files=dict(train="train.csv", validation="val.csv"),
)

Generating train split: 792287 examples [00:03, 260391.96 examples/s]
Generating validation split: 37019 examples [00:00, 252104.17 examples/s]


In [7]:
from transformers import EsmForMaskedLM, EsmTokenizer

# One checkpoint name is used for both the tokenizer and the masked-LM model.
model_name = "facebook/esm2_t6_8M_UR50D"
tokenizer, model = (
    EsmTokenizer.from_pretrained(model_name),
    EsmForMaskedLM.from_pretrained(model_name),
)

In [8]:
def tokenize_function(examples):
    """Tokenize the "sequence" field of a batch with the notebook-level tokenizer.

    Args:
        examples: Mapping with a "sequence" entry; its value is handed to the
            tokenizer unchanged.

    Returns:
        The tokenizer's output for those sequences, with truncation enabled
        and ``max_length=1024``.
    """
    sequences = examples["sequence"]
    encoded = tokenizer(sequences, max_length=1024, truncation=True)
    return encoded

# Run the tokenizer over the dataset in batches, then drop the raw
# "sequence" and "id" columns from the result.
tokenized_dataset = dataset.map(
    function=tokenize_function,
    remove_columns=["sequence", "id"],
    batched=True,
)

Map:   0%|          | 0/792287 [00:00<?, ? examples/s]

Map: 100%|██████████| 792287/792287 [21:26<00:00, 615.92 examples/s] 
Map: 100%|██████████| 37019/37019 [00:21<00:00, 1741.06 examples/s]


In [10]:
from transformers import DataCollatorForLanguageModeling

# Masking rate handed to the collator (0.15 == 15% of tokens).
mlm_probability = 0.15

# Collator configured for masked language modelling with the rate above.
data_collator = DataCollatorForLanguageModeling(
    tokenizer,
    mlm_probability=mlm_probability,
    mlm=True,
)


In [11]:
from transformers import Trainer, TrainingArguments

# Training configuration for the MLM fine-tuning run.
#
# Checkpoints are written every 500 steps. With ~792k training examples, a
# per-device batch size of 2 and 3 epochs, that is on the order of a million
# optimizer steps on a single device, i.e. thousands of checkpoint
# directories if none are ever removed. `save_total_limit` caps how many are
# kept on disk so a long run cannot exhaust storage.
#
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="./esm-finetuned-mlm",
    evaluation_strategy="epoch",  # or "steps"
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    logging_steps=50,
    save_steps=500,
    save_total_limit=3,  # keep only the most recent checkpoints on disk
    report_to="none"  # or "tensorboard"
)

# Assemble the Trainer from the model, its arguments, the tokenized
# train/validation splits, and the MLM data collator.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=tokenized_dataset["validation"],
    train_dataset=tokenized_dataset["train"],
)

# Start the fine-tuning run.
trainer.train()


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 