In [2]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


1. Loading the Dataset

In [44]:
imdb = load_dataset("imdb")


In [45]:
# Reduce every split to a fixed-size random subset to avoid massive training time.
# Indices are drawn WITHOUT replacement over the full length of each split, so no
# review appears twice and no part of a split is excluded from sampling.
import random

SUBSET_SIZE = 5000  # rows kept per split
SEED = 42           # fixed seed so the same subset is drawn on every run

rng = random.Random(SEED)
for split_name in imdb:
    print(imdb[split_name])
    n_rows = len(imdb[split_name])
    # min() keeps this valid for a split smaller than SUBSET_SIZE.
    keep_indices = rng.sample(range(n_rows), min(SUBSET_SIZE, n_rows))
    imdb[split_name] = imdb[split_name].select(keep_indices)

imdb

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})
Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})
Dataset({
    features: ['text', 'label'],
    num_rows: 50000
})


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})

In [None]:
# Check whether label 0 and the remaining labels are roughly balanced in the training split.
train_labels = imdb['train']['label']
zeros = sum(1 for label in train_labels if label == 0)
# Everything that is not 0 is counted as a "one", as in a plain if/else tally.
ones = len(train_labels) - zeros
ones, zeros

(2478, 2522)

In [47]:
imdb['train'][500]

{'text': 'This movie was a brilliant concept. It was original, cleverly written and of high appeal to those of us who aren\'t really \'conformist\' movie pickers. Don\'t get me wrong - there are some great movies that have wide appeal, but when you move into watching a movie based on "everyone else is watching it" - you know you\'re either a tween or don\'t really have an opinion. This had a lovely subtle humor - despite most people probably looking only at the obvious. The actors portrayed their characters with aplomb and I thought there was a lot more "personal" personality in this film. Has appeal for kids, as well as adults. Esp. nice to find a good movie that\'s not filled with sexual references and drug innuendos! A great film, not to be overlooked based on public consumption. This one is a must buy.',
 'label': 1}

In [5]:
imdb['train'][0]['text']

'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, ev

2. Loading the Tokenizer and applying it to the whole dataset

In [48]:
from transformers import AutoTokenizer

# Load the tokenizer for the same checkpoint name that the classification model is created from.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")


In [49]:
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [50]:
# Because `map` is called with batched=True below, this function receives a batch of
# examples per call (so `examples['text']` holds several texts), not a single example.
def preprocess_function(examples):
    """Tokenize the 'text' field of `examples` with truncation enabled; no padding is requested here."""
    return tokenizer(examples['text'], truncation=True)

# `imdb` is a DatasetDict, so the function is mapped over every split it contains.
tokenized_imdb = imdb.map(preprocess_function, batched=True)


Map: 100%|██████████| 5000/5000 [00:00<00:00, 8076.98 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 7809.33 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 8533.51 examples/s]


In [51]:
type(imdb)

datasets.dataset_dict.DatasetDict

3. Defining a collator for the dataset. So far the tokenizer has only applied truncation, not padding, so a collator is needed to handle the padding.

In [52]:
from transformers import DataCollatorWithPadding

# preprocess_function only truncates, so padding is left to this collator,
# which is later handed to the Trainer via `data_collator=`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

4. Define Evaluation Metric

In [53]:
import evaluate

# Load the 'accuracy' metric; it is called from compute_metrics.
accuracy = evaluate.load('accuracy')

# Usage: accuracy.compute(predictions=..., references=...)

In [54]:
import numpy as np

def compute_metrics(eval_pred):
    """Turn per-class scores into class ids and score them with the `accuracy` metric.

    `eval_pred` is unpacked as a (scores, labels) pair; the predicted class for
    each row is the index of its largest score along axis 1.
    Returns whatever `accuracy.compute` returns.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [55]:
# Class-index <-> label-name mappings; label2id is built as the inverse of id2label.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: index for index, name in id2label.items()}

In [56]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Build a two-class sequence classifier from the DistilBERT checkpoint.
# The label maps are passed in so predictions can be reported by name
# ("NEGATIVE"/"POSITIVE") through `model.config.id2label`.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
# Hyperparameters for a single-epoch fine-tuning run.
training_args = TrainingArguments(
    output_dir="output",  # checkpoints land here; later cells load "output/checkpoint-313"
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    eval_strategy="epoch",  # evaluation and checkpointing share the same "epoch" schedule
    save_strategy="epoch",
    # NOTE(review): no metric is named for picking the "best" model, so the library
    # default applies (presumably validation loss) — confirm against the TrainingArguments docs.
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Wire the model, the tokenized train/test splits, the padding collator and the
# accuracy callback into a Trainer. The "unsupervised" split is not used here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [60]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.243971,0.9078


TrainOutput(global_step=313, training_loss=0.3151430611412365, metrics={'train_runtime': 492.231, 'train_samples_per_second': 10.158, 'train_steps_per_second': 0.636, 'total_flos': 654807049587648.0, 'train_loss': 0.3151430611412365, 'epoch': 1.0})

In [65]:
# Fine-tuning has been run above. For inference, load the fine-tuned checkpoint
# (not the base pretrained model) through a pipeline.

from transformers import pipeline


# Alternative (not used in this cell): load the model and tokenizer explicitly.
# model_inference = AutoModelForSequenceClassification.from_pretrained("output/checkpoint-313")
# tokenizer_inference = AutoTokenizer.from_pretrained("output")

# "checkpoint-313" corresponds to the global_step=313 reported by trainer.train().
classifier = pipeline("sentiment-analysis", model="output/checkpoint-313")
test_text = "This is a good movie"

classifier(test_text)

Device set to use mps:0


[{'label': 'POSITIVE', 'score': 0.9562123417854309}]

In [66]:
# Inference using pytorch


inference_tokenizer = AutoTokenizer.from_pretrained("output/checkpoint-313")


In [67]:
inference_model = AutoModelForSequenceClassification.from_pretrained("output/checkpoint-313")


In [76]:
import torch

input_text = "This is an astonishing movie! Did not expect this from such an obscure actor"
inputs = inference_tokenizer(input_text, return_tensors="pt")

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    logits = inference_model(**inputs).logits

# Take the argmax over the class dimension explicitly. With a single input this
# yields one id; .item() raises instead of silently returning a flattened index
# if more than one input is ever passed.
predicted_class_id = logits.argmax(dim=-1).item()
# Read the label map from the model that produced the logits, not from the
# training-time `model` object, so this cell works with only the checkpoint loaded.
inference_model.config.id2label[predicted_class_id]


'POSITIVE'

In [79]:
# Apply softmax to the logits to see the class probabilities.
# dim=1 normalizes across the entries of each row of `logits`.

softmax = torch.nn.Softmax(dim=1)
softmax(logits)

tensor([[0.0830, 0.9170]])

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
