## Install Libraries

In [17]:
# Uncomment to install or upgrade the packages used by this notebook.
# `%pip` (rather than `!pip3`) installs into the running kernel's environment.
# `datasets` is listed because the notebook imports `load_dataset` from it.
# %pip install -U accelerate
# %pip install -U transformers
# %pip install -U torch
# %pip install -U datasets

## Prepare the Dataset

In [2]:
from datasets import load_dataset


In [3]:
# Load the dataset published under the name "imdb".
dataset = load_dataset(path="imdb")

In [4]:
# Peek at a single raw record from the training split.
dataset["train"][50]

{'text': 'I saw this film opening weekend in Australia, anticipating with an excellent cast of Ledger, Edgerton, Bloom, Watts and Rush that the definitive story of Ned Kelly would unfold before me. Unfortunately, despite an outstanding performance by Heath Ledger in the lead role, the plot was paper thin....which doesn\'t inspire me to read "Our Sunshine". There were some other plus points, the support acting from Edgerton in particular, assured direction from Jordan (confirming his talent on show in Buffalo Soldiers as well), and production design that gave a real feel of harshness to the Australian bush, much as the Irish immigrants of the early 19th century must have seen it. But I can\'t help feeling that another opportunity has been missed to tell the real story of an Australian folk hero (or was he?)....in what I suspect is a concession to Hollywood and selling the picture in the US. Oh well, at least Jordan and the producers didn\'t agree to lose the beards just to please Univer

## Tokenizer

In [5]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer for the "bert-base-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-cased",
)


def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook's ``tokenizer``.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; whatever it returns is passed back unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# Run the tokenizer over the dataset, handing it batches of examples at a time.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [6]:
# Shuffle each split with a fixed seed, then keep the first 1000 examples.
shuffled_train = tokenized_datasets["train"].shuffle(seed=42)
small_train_dataset = shuffled_train.select(range(1000))

shuffled_test = tokenized_datasets["test"].shuffle(seed=42)
small_eval_dataset = shuffled_test.select(range(1000))

## Model Selection

In [7]:
from transformers import AutoModelForSequenceClassification

# Two output labels, one per class: positive and negative.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training Arguments

In [18]:
from transformers import TrainingArguments

# Only `output_dir` is set here; add further keyword arguments to this call
# to tune the hyperparameters.
training_args = TrainingArguments(
    output_dir="test_trainer",
)

## Training

In [11]:
from transformers import Trainer

In [12]:
# Wire the model, its training arguments, and the two 1000-example subsets
# into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
)

In [13]:
# Run training and show the returned result as the cell output.
train_output = trainer.train()
train_output

Step,Training Loss


TrainOutput(global_step=375, training_loss=0.49599308268229164, metrics={'train_runtime': 465.6358, 'train_samples_per_second': 6.443, 'train_steps_per_second': 0.805, 'total_flos': 789333166080000.0, 'train_loss': 0.49599308268229164, 'epoch': 3.0})

In [14]:
# Training took ~7 minutes.

## Saving the Model

In [15]:
# Save the fine-tuned model and the tokenizer into the same directory so that
# both can later be reloaded from it with `from_pretrained`. Saving only the
# model would leave the directory without the tokenizer files.
output_dir = "./fine_tuned_models/"
model.save_pretrained(output_dir)
# Trailing `;` keeps the returned tuple of file paths out of the cell output.
tokenizer.save_pretrained(output_dir);