# Fine-tune for text classification based on an oreilly book

Reference:
* git code https://github.com/alinazhanguwo/oreilly-Natural-Language-Processing-with-Transformers/blob/main/02_classification.ipynb
* my note: https://docs.google.com/document/d/1BXdJe_r6m93m13R79r6WoEPfSk0uIzVd2puRdcT_mHk/edit#



In [1]:
! pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Download and examine a dataset

In [2]:
from datasets import load_dataset

emotions = load_dataset("emotion")

Using custom data configuration default
Reusing dataset emotion (/root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [4]:
train_ds = emotions["train"]
train_ds

Dataset({
    features: ['text', 'label'],
    num_rows: 16000
})

In [5]:
import pprint

pprint.pprint(train_ds[:5])

{'label': [0, 0, 3, 2, 3],
 'text': ['i didnt feel humiliated',
          'i can go from feeling so hopeless to so damned hopeful just from '
          'being around someone who cares and is awake',
          'im grabbing a minute to post i feel greedy wrong',
          'i am ever feeling nostalgic about the fireplace i will know that it '
          'is still on the property',
          'i am feeling grouchy']}


In [6]:
print(train_ds.features)

{'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=6, names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}


# From Datasets to DataFrames

In [7]:
import pandas as pd

emotions.set_format(type="pandas")
df = emotions["train"][:]
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


##  use the int2str() method of the label feature to create a new column with the corresponding label names

In [8]:
def label_int2str(row):
    return emotions["train"].features["label"].int2str(row)

df["label_name"] = df["label"].apply(label_int2str)
df.head()

Unnamed: 0,text,label,label_name
0,i didnt feel humiliated,0,sadness
1,i can go from feeling so hopeless to so damned...,0,sadness
2,im grabbing a minute to post i feel greedy wrong,3,anger
3,i am ever feeling nostalgic about the fireplac...,2,love
4,i am feeling grouchy,3,anger


## Tokenization: adding cols ['input_ids', 'attention_mask']

In [9]:
from transformers import AutoTokenizer

model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [10]:
text = "I am a firefly."

encoded_text = tokenizer(text)
print(encoded_text)

{'input_ids': [101, 1045, 2572, 1037, 2543, 14151, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


In [11]:
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)

['[CLS]', 'i', 'am', 'a', 'fire', '##fly', '.', '[SEP]']


In [12]:
print(tokenizer.convert_tokens_to_string(tokens))

[CLS] i am a firefly. [SEP]


# Tokenizing the Whole Dataset
## Reset the output format of dataset since don't need the DataFrame format anymore

In [13]:
emotions.reset_format()

In [14]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

In [15]:
print(tokenize(emotions["train"][:2]))

{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [16]:
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
print(emotions_encoded["train"].column_names)

['text', 'label', 'input_ids', 'attention_mask']


# Load a pretrained model

- The first thing we need is a pretrained DistilBERT model like the one we used in the feature-based approach. 

- The only slight modification is that we **use the AutoModelForSequenceClassification model instead of AutoModel**. 

- The difference is that the AutoModelForSequenceClassification model has a classification head on top of the pretrained model outputs, which can be easily trained with the base model. 

- We just need to specify how many labels the model has to predict (six in our case), since this dictates the number of outputs the classification head has.

In [18]:
from transformers import AutoModelForSequenceClassification

num_labels = 6
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=num_labels))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifi

# Defining the performance metrics

In [19]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

# Define training hyperparameters 

In [20]:
from transformers import Trainer, TrainingArguments

batch_size = 64
logging_steps = len(emotions_encoded["train"]) // batch_size
model_name = f"{model_ckpt}-finetuned-emotion"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=1,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  #logging_steps=logging_steps,
                                  #push_to_hub=True, 
                                  log_level="error")

# Create Trainer

In [21]:
from transformers import Trainer

trainer = Trainer(model=model, args=training_args, 
                  compute_metrics=compute_metrics,
                  train_dataset=emotions_encoded["train"],
                  eval_dataset=emotions_encoded["validation"],
                  tokenizer=tokenizer)


In [22]:
trainer.train();



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.546846,0.835,0.820982


# Get the predictions on the validation set

In [23]:
preds_output = trainer.predict(emotions_encoded["validation"])


In [24]:
preds_output.metrics

{'test_accuracy': 0.835,
 'test_f1': 0.8209816328906846,
 'test_loss': 0.546846330165863,
 'test_runtime': 4.3239,
 'test_samples_per_second': 462.543,
 'test_steps_per_second': 7.401}

## Check raw predictions
It also contains the raw predictions for each class. We can decode the predictions greedily using np.argmax(). This yields the predicted labels and has the same format as the labels returned by the Scikit-Learn models in the feature-based approach:

In [25]:
import numpy as np
import matplotlib.pyplot as plt

y_preds = np.argmax(preds_output.predictions, axis=1)

In [26]:
y_preds

array([0, 0, 1, ..., 1, 1, 1])

In [27]:
len(y_preds)

2000

# Create confusion matrix for evaluation (todo)

In [28]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

def plot_confusion_matrix(y_preds, y_true, labels):
    cm = confusion_matrix(y_true, y_preds, normalize="true")
    fig, ax = plt.subplots(figsize=(6, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    plt.title("Normalized confusion matrix")
    plt.show()
    

# Visualize training dataset (todo)

# Save trained model


In [29]:
trainer.save_model ("gdrive/My Drive/LOCATION")

# Load a saved model in local for prediction

In [30]:
model = AutoModelForSequenceClassification.from_pretrained("gdrive/My Drive/LOCATION", local_files_only=True)
trainer = Trainer(model=model)

y = trainer.predict(emotions_encoded["test"])

y.metrics

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2000
  Batch size = 8


{'test_loss': 0.5306264758110046,
 'test_runtime': 4.451,
 'test_samples_per_second': 449.332,
 'test_steps_per_second': 56.167}

In [31]:
y_preds = np.argmax(y.predictions, axis=1)
y_preds

array([0, 0, 0, ..., 1, 1, 4])

In [34]:
print(emotions["test"][0])

{'text': 'im feeling rather rotten so im not very ambitious right now', 'label': 0}


In [35]:
y_preds[0]

0

In [36]:
print(emotions["test"][1500])

{'text': 'im feeling just a little proud', 'label': 1}


In [37]:
y_preds[1500]

1