# Fine-Tune a Model on GLUE SST-2

In [1]:
import torch
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

  from pandas.core.computation.check import NUMEXPR_INSTALLED


## Preprocess dataset

In [2]:
# Load the 'sst2' configuration of the 'glue' dataset; the result is a
# DatasetDict with train / validation / test splits (see the output of the next cell).
sst2_datasets = load_dataset('glue', 'sst2')

In [3]:
# Inspect the available splits, their columns and row counts.
sst2_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [4]:
# Name of the pretrained checkpoint; used for both the tokenizer and the model
# so that the two stay in sync.
checkpoint = 'bert-base-cased'

In [5]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
def tokenize(example):
    """Tokenize the ``'sentence'`` field of ``example`` with the shared tokenizer.

    Args:
        example: Mapping with a ``'sentence'`` entry. When used through
            ``Dataset.map(..., batched=True)`` this is presumably a batch,
            i.e. ``example['sentence']`` is a list of strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those sentences,
        called with ``truncation=True``.
    """
    sentences = example['sentence']
    # No padding here: only truncation is requested from the tokenizer.
    return tokenizer(sentences, truncation=True)

In [7]:
# Apply `tokenize` to every split, one batch of rows at a time (batched=True).
# The original columns are kept; the tokenizer's outputs are added as new columns.
tokenized_datasets = sst2_datasets.map(tokenize, batched=True)

In [8]:
# Check that the tokenizer columns were added to every split.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [9]:
# Training configuration: evaluate once per epoch, with separate per-device
# batch sizes for training and evaluation. The first positional argument is
# the output directory (per the TrainingArguments documentation).
training_args = TrainingArguments(
    'test-trainer-sst2',
    evaluation_strategy='epoch',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
)

In [10]:
# Load the evaluation metric for the GLUE 'sst2' configuration.
metric = evaluate.load('glue', 'sst2')

In [11]:
def compute_metrics(eval_example):
    """Score one evaluation pass with the GLUE SST-2 metric.

    Args:
        eval_example: Pair that unpacks as ``(logits, labels)``. ``logits``
            must be array-like with the class scores on the last axis;
            ``labels`` holds the reference class ids.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions against
        ``labels``.
    """
    # Reuse the module-level `metric` (loaded once in an earlier cell) instead
    # of calling evaluate.load() again on every evaluation pass.
    logits, labels = eval_example
    # Predicted class = index of the highest score along the last axis.
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [12]:
# Load the pretrained checkpoint with a 2-label sequence-classification head.
# The classifier weights are not part of the checkpoint and start out freshly
# initialised (see the warning printed below), so the model must be trained
# before it is used for predictions.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Bundle the model, training configuration, train/validation splits, tokenizer
# and metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [14]:
# Run fine-tuning. With evaluation_strategy='epoch' the validation loss and
# metric are reported once per epoch in the table below.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1782,0.246972,0.905963
2,0.1142,0.271228,0.924312
3,0.0637,0.303524,0.916284


TrainOutput(global_step=6315, training_loss=0.13172928212846166, metrics={'train_runtime': 707.1859, 'train_samples_per_second': 285.706, 'train_steps_per_second': 8.93, 'total_flos': 4417483022304300.0, 'train_loss': 0.13172928212846166, 'epoch': 3.0})