In [2]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, Trainer, TrainingArguments, AutoConfig
from datasets import load_dataset


# Load the pre-trained DistilBERT model and tokenizer
# model_name = "distilbert-base-uncased"
local_model_path="/finetune/pretrained"
config = AutoConfig.from_pretrained("/finetune/pretrained/config.json")
model = DistilBertForSequenceClassification.from_pretrained(local_model_path, num_labels=2)
tokenizer = DistilBertTokenizerFast.from_pretrained(local_model_path)

# Load the IMDb movie review dataset
dataset = load_dataset("dataset/imdb.py")


# Tokenize the dataset
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length = 512)


# Process the dataset
train_data = dataset['train'].map(tokenize, batched=True)
test_data = dataset['test'].map(tokenize, batched=True)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /finetune/pretrained and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
print("TRAIN:", train_data)
# print(test_data)

TRAIN: Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 25000
})


In [4]:
# Define training parameters
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    evaluation_strategy='epoch',
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data
)


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [5]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.2556,0.227953
2,0.1295,0.198828
3,0.0616,0.247296




TrainOutput(global_step=1563, training_loss=0.14470853656053695, metrics={'train_runtime': 2704.539, 'train_samples_per_second': 27.731, 'train_steps_per_second': 0.578, 'total_flos': 9935054899200000.0, 'train_loss': 0.14470853656053695, 'epoch': 3.0})

In [6]:
model.save_pretrained('fine_tuned_model')
tokenizer.save_pretrained('fine_tuned_model')

('fine_tuned_model/tokenizer_config.json',
 'fine_tuned_model/special_tokens_map.json',
 'fine_tuned_model/vocab.txt',
 'fine_tuned_model/added_tokens.json',
 'fine_tuned_model/tokenizer.json')