In [1]:
from datasets import load_dataset
# Load the "imdb" dataset. Later cells tokenize its "text" field and
# evaluate on its "test" split.
imdb = load_dataset("imdb")

In [2]:
from transformers import AutoTokenizer
# Tokenizer for the "distilbert/distilbert-base-uncased" checkpoint -- the same
# checkpoint name the classification model is loaded from further down.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")



In [3]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples``.

    Args:
        examples: Mapping with a ``"text"`` entry, which is handed unchanged
            to the module-level ``tokenizer``.

    Returns:
        The tokenizer's output for those texts, with truncation enabled and
        a maximum length of 512. No padding is requested here.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, max_length=512)

In [4]:
# Run preprocess_function over the dataset with batched=True (presumably so
# the function receives batches of examples rather than single rows).
# NOTE(review): this maps over the whole `imdb` object, but only the "test"
# split is used later in this notebook.
tokenized_imdb = imdb.map(preprocess_function, batched=True)

In [5]:
from transformers import DataCollatorWithPadding
# Collator built from the tokenizer and later handed to the Trainer.
# preprocess_function truncates but does not pad, so padding is left to this
# collator (presumably applied per batch -- confirm against the library docs).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

2024-03-02 17:41:04.402015: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-02 17:41:04.478762: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
import evaluate
# Load the "accuracy" metric; compute_metrics calls its .compute(...) method.
accuracy = evaluate.load("accuracy")

In [7]:
import numpy as np
def compute_metrics(eval_pred):
    """Turn raw model outputs into an accuracy score.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` is
            reduced with ``argmax`` along axis 1, so it is presumably laid
            out as (examples, classes) -- confirm against the caller.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids
        versus the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [8]:
# Class index <-> sentiment name. Both maps are passed to the model when it
# is loaded; the reverse map is derived from the forward one so the two can
# never drift apart.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {label: idx for idx, label in id2label.items()}

In [9]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, set_seed

# The checkpoint ships without a classification head, so `from_pretrained`
# creates one with random weights (see the warning this cell prints).
# Seeding first makes that initialisation -- and therefore the baseline
# accuracy measured below -- repeatable across kernel restarts.
SEED = 42
set_seed(SEED)

# Two-class sequence classifier on top of DistilBERT, labelled with the
# NEGATIVE/POSITIVE maps defined in the previous cell.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Evaluation-only setup: the Trainer gets the model, the tokenized "test"
# split, the padding collator and the accuracy callback. No training dataset
# is supplied.
# NOTE(review): no `args` is passed even though TrainingArguments is imported
# in the model cell, so library-default settings apply -- confirm intended.
trainer = Trainer(
    model=model,                         
    eval_dataset=tokenized_imdb["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [15]:
# Evaluate on the tokenized "test" split; the result is indexed by
# "eval_accuracy" in the next cell.
# NOTE(review): the same dataset was already given to the Trainer
# constructor, so this argument looks redundant -- confirm.
eval_result = trainer.evaluate(eval_dataset=tokenized_imdb["test"])

In [16]:
# Report the accuracy of the model as loaded, before any fine-tuning. The
# model-loading cell warns that the classifier weights are newly initialised,
# so this number is an untrained baseline.
base_accuracy = eval_result["eval_accuracy"]
print(f"Base model accuracy : {base_accuracy}")

Base model accuracy : 0.4964
