In [2]:
import transformers
import torch

  from .autonotebook import tqdm as notebook_tqdm


## 1. UTILIZE FINETUNED MODEL WITH PIPELINE:

In [3]:
from transformers import pipeline

# Build an inference pipeline around an already fine-tuned sentiment checkpoint.
# No task name is passed, so choosing the task is left to `pipeline` itself.
sentiment = pipeline(model = 'cardiffnlp/twitter-roberta-base-sentiment')

In [4]:
# Clearly positive sentence; the recorded run labels it LABEL_2.
sentiment("I love my phone.")

[{'label': 'LABEL_2', 'score': 0.9864782094955444}]

In [5]:
# Clearly negative sentence; the recorded run labels it LABEL_0.
sentiment("I hate my phone.")

[{'label': 'LABEL_0', 'score': 0.9747684597969055}]

## 2. FINETUNE A PRETRAINED BASE MODEL OURSELVES:

In [6]:
import torch 
# Check whether a CUDA device is usable. The recorded run returned False,
# so the cells below presumably ran on CPU.
torch.cuda.is_available()

False

In [7]:
from datasets import load_dataset
import pandas as pd
# Load the IMDB reviews dataset; it comes back as a DatasetDict with
# train, test and unsupervised splits (see the next cell).
imdb = load_dataset('imdb')

Downloading builder script: 100%|██████████| 4.31k/4.31k [00:00<?, ?B/s]
Downloading metadata: 100%|██████████| 2.17k/2.17k [00:00<?, ?B/s]
Downloading readme: 100%|██████████| 7.59k/7.59k [00:00<?, ?B/s]
Downloading data: 100%|██████████| 84.1M/84.1M [00:39<00:00, 2.12MB/s]  
Generating train split: 100%|██████████| 25000/25000 [00:11<00:00, 2148.02 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:11<00:00, 2248.08 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:12<00:00, 3864.19 examples/s]


In [8]:
# Show the available splits with their columns and row counts.
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [9]:
# Look at one raw review. Index the row first so only that example is read,
# instead of pulling the whole 'text' column just to pick one entry.
imdb['train'][2]['text']

"If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.<br /><br />One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away with no new perspectives (unless one comes up with one while one's mind wanders, as it will invariably do during this pointless film).<br /><br />One might better spend one's time staring out a window at a tree growing.<br /><br />"

In [10]:
# Gold label of the same review (the recorded value is 0). Row-first indexing
# avoids reading the whole 'label' column for a single value.
imdb['train'][2]['label']

0

In [11]:
# Run the off-the-shelf sentiment pipeline on that review for comparison with its gold label.
# Row-first indexing fetches just this example rather than the full 'text' column.
prediction = sentiment(imdb['train'][2]['text'])
prediction

[{'label': 'LABEL_1', 'score': 0.4160052537918091}]

In [12]:
# Keep the demo small: shuffle each split with a fixed seed, then keep
# the first 1000 training and 100 test examples.
train_subset = (
    imdb["train"]
    .shuffle(seed=42)
    .select(range(1000))
)
test_subset = (
    imdb["test"]
    .shuffle(seed=42)
    .select(range(100))
)

In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Base checkpoint that will be fine-tuned below.
pretrained_model_name = "distilbert-base-uncased"

# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

# Sequence-classification model with two output classes. The recorded load
# message reports that the classifier weights are newly initialized, so the
# model has to be trained before its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name,
    num_labels=2,
)

tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
config.json: 100%|██████████| 483/483 [00:00<?, ?B/s] 
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.82MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 7.29MB/s]
model.safetensors: 100%|██████████| 268M/268M [00:14<00:00, 18.3MB/s] 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Inspect one review from the training subset. Row-first indexing reads only
# this example instead of the whole 'text' column.
train_subset[2]['text']

'George P. Cosmatos\' "Rambo: First Blood Part II" is pure wish-fulfillment. The United States clearly didn\'t win the war in Vietnam. They caused damage to this country beyond the imaginable and this movie continues the fairy story of the oh-so innocent soldiers. The only bad guys were the leaders of the nation, who made this war happen. The character of Rambo is perfect to notice this. He is extremely patriotic, bemoans that US-Americans didn\'t appreciate and celebrate the achievements of the single soldier, but has nothing but distrust for leading officers and politicians. Like every film that defends the war (e.g. "We Were Soldiers") also this one avoids the need to give a comprehensible reason for the engagement in South Asia. And for that matter also the reason for every single US-American soldier that was there. Instead, Rambo gets to take revenge for the wounds of a whole nation. It would have been better to work on how to deal with the memories, rather than suppressing them. 

In [None]:
# Split that review into sub-word tokens (continuation pieces carry a '##'
# prefix in the output). Row-first indexing fetches just this example.
tokens = tokenizer.tokenize(train_subset[2]['text'])
tokens

['george',
 'p',
 '.',
 'co',
 '##sma',
 '##tos',
 "'",
 '"',
 'ram',
 '##bo',
 ':',
 'first',
 'blood',
 'part',
 'ii',
 '"',
 'is',
 'pure',
 'wish',
 '-',
 'fulfillment',
 '.',
 'the',
 'united',
 'states',
 'clearly',
 'didn',
 "'",
 't',
 'win',
 'the',
 'war',
 'in',
 'vietnam',
 '.',
 'they',
 'caused',
 'damage',
 'to',
 'this',
 'country',
 'beyond',
 'the',
 'im',
 '##agi',
 '##nable',
 'and',
 'this',
 'movie',
 'continues',
 'the',
 'fairy',
 'story',
 'of',
 'the',
 'oh',
 '-',
 'so',
 'innocent',
 'soldiers',
 '.',
 'the',
 'only',
 'bad',
 'guys',
 'were',
 'the',
 'leaders',
 'of',
 'the',
 'nation',
 ',',
 'who',
 'made',
 'this',
 'war',
 'happen',
 '.',
 'the',
 'character',
 'of',
 'ram',
 '##bo',
 'is',
 'perfect',
 'to',
 'notice',
 'this',
 '.',
 'he',
 'is',
 'extremely',
 'patriotic',
 ',',
 'be',
 '##mo',
 '##ans',
 'that',
 'us',
 '-',
 'americans',
 'didn',
 "'",
 't',
 'appreciate',
 'and',
 'celebrate',
 'the',
 'achievements',
 'of',
 'the',
 'single',
 '

In [None]:
# The same review as integer vocabulary ids. Row-first indexing fetches just
# this example rather than the whole 'text' column.
tokenizer.encode(train_subset[2]['text'])

[101,
 2577,
 1052,
 1012,
 2522,
 26212,
 13122,
 1005,
 1000,
 8223,
 5092,
 1024,
 2034,
 2668,
 2112,
 2462,
 1000,
 2003,
 5760,
 4299,
 1011,
 29362,
 1012,
 1996,
 2142,
 2163,
 4415,
 2134,
 1005,
 1056,
 2663,
 1996,
 2162,
 1999,
 5148,
 1012,
 2027,
 3303,
 4053,
 2000,
 2023,
 2406,
 3458,
 1996,
 10047,
 22974,
 22966,
 1998,
 2023,
 3185,
 4247,
 1996,
 8867,
 2466,
 1997,
 1996,
 2821,
 1011,
 2061,
 7036,
 3548,
 1012,
 1996,
 2069,
 2919,
 4364,
 2020,
 1996,
 4177,
 1997,
 1996,
 3842,
 1010,
 2040,
 2081,
 2023,
 2162,
 4148,
 1012,
 1996,
 2839,
 1997,
 8223,
 5092,
 2003,
 3819,
 2000,
 5060,
 2023,
 1012,
 2002,
 2003,
 5186,
 14314,
 1010,
 2022,
 5302,
 6962,
 2008,
 2149,
 1011,
 4841,
 2134,
 1005,
 1056,
 9120,
 1998,
 8439,
 1996,
 10106,
 1997,
 1996,
 2309,
 5268,
 1010,
 2021,
 2038,
 2498,
 2021,
 29245,
 2005,
 2877,
 3738,
 1998,
 8801,
 1012,
 2066,
 2296,
 2143,
 2008,
 6985,
 2015,
 1996,
 2162,
 1006,
 1041,
 1012,
 1043,
 1012,
 1000,
 2057,
 2020

In [None]:
# Map a single vocabulary id back to its token text.
tokenizer.decode(2034)

'first'

In [None]:
# Encode a short sentence with padding='max_length': in the recorded output the
# real ids are followed by a long run of 0s.
# NOTE(review): this cell has no execution count, so the recorded output may
# predate the current code — re-run to confirm the ids.
tokenizer.encode("I love my phone.", padding='max_length', truncation=True)

[101,
 1045,
 2293,
 2026,
 2564,
 1012,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 

In [19]:
# Round trip: decoding the padded ids exposes the [CLS]/[SEP] markers and the [PAD] filler.
tokenizer.decode(tokenizer.encode("I love my phone.", padding='max_length', truncation=True))

'[CLS] i love my phone. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD

In [20]:
def process(cases, max_length=10):
    """Tokenize the ``'text'`` field of a batch of examples.

    Args:
        cases: Batch handed over by ``Dataset.map(..., batched=True)``; its
            ``'text'`` entry is passed to the tokenizer.
        max_length: Maximum sequence length given to the tokenizer; longer
            inputs are truncated. Defaults to 10, which is deliberately tiny
            so the demo trains quickly.

    Returns:
        The tokenizer output for the batch.
    """
    # No padding is requested here; it is left to the data collator given to the Trainer.
    return tokenizer(cases['text'], truncation=True, max_length=max_length)

# Tokenize both subsets in batches with the same function.
tokenized_train, tokenized_test = (
    split.map(process, batched=True)
    for split in (train_subset, test_subset)
)


Map: 100%|██████████| 1000/1000 [00:00<00:00, 3012.79 examples/s]

Map: 100%|██████████| 1000/1000 [00:00<00:00, 2802.42 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1330.60 examples/s]


In [21]:
# Confirm that mapping added the 'input_ids' and 'attention_mask' columns.
tokenized_train

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1000
})

In [22]:
from transformers import DataCollatorWithPadding
# `process` does not pad, so padding is delegated to this collator,
# which is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [23]:
import numpy as np
from datasets import load_metric
 
def compute_metrics(eval_pred):
    """Compute accuracy and positive-class F1 for a batch of predictions.

    The metrics are computed directly with NumPy, so no metric script has to
    be loaded (or downloaded) each time evaluation runs.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` holds one score per
            class along the last axis; ``labels`` holds the reference class ids.

    Returns:
        dict with two floats:
            ``"accuracy"``: fraction of predictions equal to the label.
            ``"f1"``: F1 score of class 1, ``2*TP / (2*TP + FP + FN)``;
            0.0 when class 1 is neither predicted nor present.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Guard the empty case so we return 0.0 instead of NaN from an empty mean.
    accuracy = float(np.mean(predictions == labels)) if labels.size else 0.0

    predicted_pos = predictions == 1
    actual_pos = labels == 1
    true_pos = int(np.sum(predicted_pos & actual_pos))
    false_pos = int(np.sum(predicted_pos & ~actual_pos))
    false_neg = int(np.sum(~predicted_pos & actual_pos))

    denominator = 2 * true_pos + false_pos + false_neg
    f1 = (2 * true_pos / denominator) if denominator else 0.0
    return {"accuracy": accuracy, "f1": f1}

In [24]:
from transformers import TrainingArguments, Trainer

# Hyperparameters for the fine-tuning run; checkpoints go under ./checkpoints
# and the save strategy is per epoch.
training_args = TrainingArguments(
    output_dir="./checkpoints",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    seed=42,
)

# Wire the model, tokenized data, tokenizer, padding collator and metric
# function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [25]:
# Run fine-tuning.
# NOTE(review): the recorded progress bar stops at 147/189 steps, so this run
# looks interrupted — re-run before trusting the outputs below.
trainer.train()

  0%|          | 0/189 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 78%|███████▊  | 147/189 [03:00<00:52,  1.26s/it]

In [None]:
# Evaluate on the eval dataset; the recorded result holds the loss plus the
# accuracy and f1 from compute_metrics, each prefixed with 'eval_'.
trainer.evaluate()

13it [00:39,  2.42s/it]                      
Downloading builder script: 6.50kB [00:00, 6.49MB/s]                   
14it [00:45,  3.25s/it]


{'eval_loss': 0.698319673538208,
 'eval_accuracy': 0.56,
 'eval_f1': 0.5686274509803921,
 'eval_runtime': 6.8239,
 'eval_samples_per_second': 14.654,
 'eval_steps_per_second': 1.026,
 'epoch': 3.0}

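Explanation: To reduce training time, we truncate every input sequence to a maximum length of 10 tokens. This is not good practice: the model only sees the first few words of each review, which likely explains why the accuracy above (0.56) is barely better than chance. It does, however, let us test the whole workflow quickly.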