In [1]:
import evaluate 
import numpy as np
import torch
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import Trainer
from transformers import TrainingArguments
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
raw_datasets = load_dataset("imdb")


In [3]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [4]:
pretrained_model_id = "bert-base-uncased"

In [5]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id)
tokenizer("Hello, this is a text I want to be tokenized")

{'input_ids': [101, 7592, 1010, 2023, 2003, 1037, 3793, 1045, 2215, 2000, 2022, 19204, 3550, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [6]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
from datasets import load_metric

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    
    accuracy_metric = load_metric("accuracy")
    f1_metric = load_metric("f1")

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="macro")  # for macro F1

    return {"accuracy": accuracy["accuracy"], "f1": f1["f1"]}

In [9]:
training_args = TrainingArguments(
    "test-trainer",
    evaluation_strategy="epoch",
    per_device_eval_batch_size=12,
    per_device_train_batch_size=12,
    fp16=True,
    num_train_epochs=1
)

# Transfer Learning

In [10]:
# clear VRAM GPU cache
torch.cuda.empty_cache()

model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_id,
    num_labels=2
)
model.cuda()
print(model)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [11]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [12]:
# clear VRAM GPU cache
torch.cuda.empty_cache()

trainer.train()

 24%|██▍       | 500/2084 [02:46<08:47,  3.00it/s]Checkpoint destination directory test-trainer/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.3697, 'learning_rate': 3.807581573896353e-05, 'epoch': 0.24}


 48%|████▊     | 1000/2084 [05:35<06:03,  2.98it/s]Checkpoint destination directory test-trainer/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.2724, 'learning_rate': 2.607965451055662e-05, 'epoch': 0.48}


 72%|███████▏  | 1500/2084 [08:26<03:16,  2.97it/s]Checkpoint destination directory test-trainer/checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.2424, 'learning_rate': 1.4083493282149713e-05, 'epoch': 0.72}


 96%|█████████▌| 2000/2084 [11:14<00:28,  2.97it/s]Checkpoint destination directory test-trainer/checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.2136, 'learning_rate': 2.0873320537428026e-06, 'epoch': 0.96}


  accuracy_metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.

Downloading builder script: 6.50kB [00:00, 4.13MB/s]                   
                                                   
100%|██████████| 2084/2084 [15:36<00:00,  2.23it/s]

{'eval_loss': 0.18620875477790833, 'eval_accuracy': 0.93908, 'eval_f1': 0.9390788840226291, 'eval_runtime': 231.9936, 'eval_samples_per_second': 107.762, 'eval_steps_per_second': 8.983, 'epoch': 1.0}
{'train_runtime': 936.3146, 'train_samples_per_second': 26.7, 'train_steps_per_second': 2.226, 'train_loss': 0.27131269714882644, 'epoch': 1.0}





TrainOutput(global_step=2084, training_loss=0.27131269714882644, metrics={'train_runtime': 936.3146, 'train_samples_per_second': 26.7, 'train_steps_per_second': 2.226, 'train_loss': 0.27131269714882644, 'epoch': 1.0})

In [13]:
model.evaluate()

AttributeError: 'BertForSequenceClassification' object has no attribute 'evaluate'

In [15]:
def classify_text(text):
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to("cuda")
    outputs = model(**inputs)

    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(probs).item()
    return predicted_class

print(classify_text("Critters is a cool movie."))
print(classify_text("Titanic is a horrible movie."))

1
0
