This notebook was executed in Google Colab on an A100 GPU.

### Start of execution

In [1]:
import time

In [2]:
# Record the wall-clock start time; the last cell subtracts it from the end time to report the total runtime.
start = time.time()

# 1. Setting up the environment

In [3]:
!pip install -q datasets==2.20.0

# 2. Import libraries

In [4]:
import warnings
# Suppress every Python warning from this point on (no category or module filter is given).
warnings.filterwarnings("ignore")

In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import set_seed

# 3. Preparation

In [6]:
# Mount Google Drive at /content/drive; the trained model is later saved under a Drive folder.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Project folder on Google Drive, written as an absolute path under the mount
# point (/content/drive) so it does not depend on the current working directory.
path_general = '/content/drive/MyDrive/Profesional_Academico/Github_Personal/ML_AI_Contents/09.Deep_Learning/50.BERT_HF_Trainer'

# 3. Load Dataset

In [8]:
# Load the "imdb" dataset; its "train" and "test" splits are used for training and evaluation below.
imdb = load_dataset("imdb")

# 4. Load model

In [9]:
# Seed the random number generators before the model is built: the
# classification head ('classifier.weight' / 'classifier.bias') is newly
# initialized rather than loaded from the checkpoint, so without a seed it
# starts from different values on every run.
set_seed(42)

# BERT base (uncased) with a two-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Tokenizer for the same "bert-base-uncased" checkpoint the model was loaded from.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# 5. Creation of datasets

In [11]:
def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: Mapping with a "text" entry holding the raw review text(s).

    Returns:
        The result of calling the module-level ``tokenizer`` on those texts
        with truncation enabled. No padding is requested here.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [12]:
# Run the tokenizer over the dataset; batched=True hands preprocess_function
# lists of examples instead of one example at a time.
tokenized_imdb = imdb.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [13]:
# Padding collator built from the tokenizer; passed to the Trainer as its data_collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 6. Training

In [14]:
# Number of training epochs; forwarded to TrainingArguments as num_train_epochs.
num_epochs = 1

In [15]:
# Hyper-parameters for the fine-tuning run; the epoch count comes from `num_epochs`.
training_args = TrainingArguments(
    output_dir = "./results",
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = num_epochs,
    weight_decay = 0.01,
    # Evaluate on the eval dataset at the end of each epoch.
    # NOTE(review): recent transformers releases deprecate `evaluation_strategy`
    # in favour of `eval_strategy` - confirm against the installed version.
    evaluation_strategy = "epoch",
    # Explicitly turn off intermediate checkpoints (previously achieved with a
    # zero save interval); the final model is saved separately after training.
    save_strategy = "no",
    logging_steps = 25,
)

In [16]:
# Assemble the Trainer: model and run configuration first, then the
# tokenizer/collator pair, then the train and evaluation splits.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
)

In [17]:
# Run fine-tuning; as the cell's last expression, the returned TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2189,0.178815


TrainOutput(global_step=1563, training_loss=0.2410340388463387, metrics={'train_runtime': 656.4657, 'train_samples_per_second': 38.083, 'train_steps_per_second': 2.381, 'total_flos': 6515172397265280.0, 'train_loss': 0.2410340388463387, 'epoch': 1.0})

# 7. Save the model

In [18]:
# Save the fine-tuned model into the "model" folder under path_general.
model_dir = path_general + '/model'
trainer.save_model(model_dir)

### End of execution

In [19]:
# Stop the clock and split the elapsed wall-clock time since `start`
# into whole hours, minutes and seconds.
end = time.time()
delta = end - start

hours = int(delta / 3_600)
leftover = delta - hours * 3_600  # seconds left after removing whole hours
mins = int(leftover / 60)
secs = int(leftover - mins * 60)

print(f'Hours: {hours}, Minutes: {mins}, Seconds: {secs}')

Hours: 0, Minutes: 11, Seconds: 31
