In [1]:
from transformers import BertForSequenceClassification, BertTokenizerFast
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Preprocessing hook applied to batches of examples before they reach the model
def tokenize_function(examples, tokenizer):
    """Tokenize the ``"utt"`` texts of a batch and attach the batch's labels.

    Args:
        examples: Mapping with at least the keys ``"utt"`` and ``"label"``.
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding="max_length", truncation=True)``; its
            return value must support item assignment.

    Returns:
        The object returned by ``tokenizer``, with ``examples["label"]``
        stored under the key ``"label"``.
    """
    utterances = examples["utt"]
    encoded = tokenizer(utterances, padding="max_length", truncation=True)
    # Carry the labels along so the encoded batch is self-contained.
    encoded["label"] = examples["label"]
    return encoded

In [5]:
def regular_finetune_bert(model_name = "bert-base-uncased", cache_dir="/scratch/afz225/.cache", seed=42):
    """Fine-tune a BERT sequence classifier on MASSIVE (en-US) and evaluate it on the test split.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained`` for both
            the tokenizer and the model.
        cache_dir: Directory handed to ``load_dataset`` as its cache location.
            The default keeps the path this notebook originally hard-coded;
            override it when running on another machine.
        seed: Seed applied through ``transformers.set_seed`` before the model is
            created and forwarded to ``TrainingArguments``.

    Returns:
        The metrics dict returned by ``trainer.evaluate`` on the tokenized test split.

    Side effects:
        Opens a Weights & Biases run (always closed before returning or
        re-raising), writes checkpoints under ``./results-<name>`` and saves the
        final model to ``./regular-fine-tuned_<name>``, where ``<name>`` is
        ``model_name`` with ``/`` replaced by ``-``.
    """
    # Local import: the notebook's import cell does not bring set_seed into scope.
    from transformers import set_seed

    # Hub ids such as "org/model" contain "/", which is not usable in a wandb
    # project name and would create unintended nested output directories.
    # For names without "/" (including the default) nothing changes.
    safe_name = model_name.replace("/", "-")

    wandb.init(project=f"nlphw2_{safe_name}_regular-finetuning")
    try:
        # Seed after wandb.init (so run creation is unaffected) and before the
        # model is built: the classification head is freshly initialised, so
        # without a seed its starting weights differ from run to run.
        set_seed(seed)

        # Load the Amazon Science Massive dataset (English); the "intent" column
        # is renamed to "label", the key tokenize_function copies into each batch.
        train_val_test = load_dataset(
            "AmazonScience/massive", 'en-US', cache_dir=cache_dir
        ).rename_columns({"intent": "label"})
        train_dataset = train_val_test["train"]
        val_dataset = train_val_test["validation"]
        test_dataset = train_val_test["test"]

        tokenizer = BertTokenizerFast.from_pretrained(model_name)
        # One output unit per intent class declared in the dataset features.
        num_labels = len(train_dataset.features['label'].names)
        model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

        # Preprocess training, validation and test data
        map_kwargs = {"fn_kwargs": {"tokenizer": tokenizer}, "batched": True}
        train_dataset = train_dataset.map(tokenize_function, **map_kwargs)
        val_dataset = val_dataset.map(tokenize_function, **map_kwargs)
        test_dataset = test_dataset.map(tokenize_function, **map_kwargs)

        # Training arguments
        training_args = TrainingArguments(
            output_dir="./results-" + safe_name,
            overwrite_output_dir=True,
            num_train_epochs=3,
            per_device_train_batch_size=16,
            # Save and evaluate on the same 500-step cadence so that
            # load_best_model_at_end has an evaluated checkpoint to restore.
            save_steps=500,
            eval_steps=500,
            logging_steps=50,
            load_best_model_at_end=True,
            save_total_limit=2,
            save_strategy="steps",
            # NOTE(review): newer transformers releases appear to rename this
            # argument to `eval_strategy` — confirm against the installed version.
            evaluation_strategy="steps",
            warmup_ratio=0.1,
            seed=seed,
            report_to="wandb"
        )

        # Create the Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
        )

        # Start training
        trainer.train()

        # Save the fine-tuned model
        trainer.save_model(f"./regular-fine-tuned_{safe_name}")
        test_metrics = trainer.evaluate(test_dataset)
    except BaseException:
        # Close the run as failed so it is not left open in the kernel
        # (also covers a manual interrupt of the cell).
        wandb.finish(exit_code=1)
        raise

    # Close the run only after the test evaluation, which happens while the run is still open.
    wandb.finish()
    return test_metrics

In [6]:
# Run the fine-tuning with the default checkpoint ("bert-base-uncased"); the
# returned test-split metrics dict is displayed as this cell's output.
regular_finetune_bert()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 2974/2974 [00:00<00:00, 3434.80 examples/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
500,1.0457,0.876734
1000,0.4717,0.553591
1500,0.2249,0.478846
2000,0.1768,0.467182


{'eval_loss': 0.48236119747161865,
 'eval_runtime': 25.405,
 'eval_samples_per_second': 117.063,
 'eval_steps_per_second': 14.643,
 'epoch': 3.0}