# Installation

In [2]:
!pip install transformers datasets evaluate
!pip install accelerate -U


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Restart the runtime after installation so that the newly installed packages are picked up.

# Dataset

In [10]:
from datasets import load_dataset

# Load the IMDB training split, then carve out a small label-stratified
# train/test subset (5% / 2.5% of the split) to keep fine-tuning fast.
imdb_train_full = load_dataset("imdb", split="train")
imdb = imdb_train_full.train_test_split(
    test_size=0.025,
    train_size=0.05,
    stratify_by_column="label",
    seed=42,  # fixed seed so the sampled subset is identical on every run
)

In [13]:
# Show the available splits and their row counts.
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1250
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 625
    })
})

In [14]:
# Peek at the first training example (review text plus its label).
imdb["train"][0]

{'text': "Ugh. This is a terrible film, full of disastrous comic relief, no scares, and scary leaps in story and plotline. The only creepy thing here is the leading lady's hats. Lugosi was on his downhill slide and it shows. I give this a 1, and this ain't no fun.",
 'label': 0}

In [15]:
# Inspect the column schema of the training split.
imdb["train"].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None)}

# Tokenizer

In [16]:
from transformers import AutoTokenizer

# Tokenizer for the same DistilBERT checkpoint that is fine-tuned later on.
TOKENIZER_CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [17]:
# Quick sanity check: tokenize a single sentence and display the encoding.
tokenizer("This is a test and a simple example")

{'input_ids': [101, 2023, 2003, 1037, 3231, 1998, 1037, 3722, 2742, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [16]:
def _tokenize_reviews(batch):
    """Tokenize the ``text`` column of a batch with truncation enabled."""
    return tokenizer(batch["text"], truncation=True)


imdb_tokenized = imdb.map(_tokenize_reviews, batched=True)

Map: 100%|███████████████████████████████████████████████████████████████████████████████████████| 25000/25000 [00:02<00:00, 8696.22 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████| 25000/25000 [00:02<00:00, 8526.86 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████| 50000/50000 [00:05<00:00, 8418.14 examples/s]


# Training

## Data Tokenization

In [25]:
# Tokenize in batches: the tokenizer accepts a list of texts, so batched=True
# produces the same per-example encodings as mapping row by row while making
# far fewer tokenizer calls. No padding is requested here.
imdb_tokenized = imdb.map(
    lambda batch: tokenizer(batch["text"], truncation=True),
    batched=True,
)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1250/1250 [00:00<00:00, 2101.88 examples/s]


## Label <-> ID mappings

In [19]:
# Display names for the two sentiment classes, keyed by class id.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
# Reverse lookup derived from id2label so the two mappings cannot drift apart.
label2id = {name: class_id for class_id, name in id2label.items()}

## Metrics

In [20]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` is reduced
            with an argmax over axis 1, so it is presumably a 2-D array of
            per-class scores (TODO confirm against what the Trainer passes in).

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids and the
        reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Model for training

In [21]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Sequence-classification model on top of the DistilBERT checkpoint, with one
# output per entry in the label mapping (two classes).
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training Arguments

In [22]:
training_args = TrainingArguments(
    # Output location for this run.
    output_dir="out_sentiment_analysis4",
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch; reload the best model when done.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Emit a log entry every 32 steps.
    logging_steps=32,
)

In [23]:
from transformers import DataCollatorWithPadding

# Batch collator built from the same tokenizer. Tokenization above requested
# truncation only, so padding is presumably left to this collator at batch
# time (see the DataCollatorWithPadding documentation).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Wire the model, hyper-parameters, tokenized splits, and metric function
# into a single Trainer.
train_split, eval_split = imdb_tokenized["train"], imdb_tokenized["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
