<a href="https://colab.research.google.com/github/alex-smith-uwec/NLP_Spring2025/blob/main/FineTune_Basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# If you plan to actually get to the fine-tuning stage at the bottom of this notebook, change the runtime to GPU before you begin. If you are just noodling around with the cells before that, leave the runtime on CPU.

The content of this notebook is adapted from the video below by Lewis Tunstall.


In [None]:
# @title

from IPython.display import HTML

# Embed the source video by Lewis Tunstall so it can be played inside the notebook.
video_iframe = (
    '<iframe width="560" height="315" '
    'src="https://www.youtube.com/embed/u--UVvH-LIQ?si=EqRlGOizWG7tgF7b" '
    'frameborder="0" allowfullscreen></iframe>'
)
HTML(video_iframe)


In [None]:
# !pip install 'accelerate>=0.21.0' -U -q

In [None]:
!pip install 'transformers[torch]' -U -q
# !pip install 'transformers[tensorflow]'


In [None]:
import transformers
# Print the installed transformers version; several arguments used further down
# (e.g. in TrainingArguments) were renamed between releases.
print(transformers.__version__)


In [None]:
!pip install datasets -q

In [None]:
from datasets import load_dataset

from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoModelForSequenceClassification

import torch
from torch import nn

import random
from sklearn.metrics import f1_score

# The Emotion Dataset

See the [emotion dataset](https://huggingface.co/datasets/dair-ai/emotion) dataset card on Hugging Face.

In [None]:
# Load the dataset by its full Hub repository id (the same one linked in the
# dataset card above) instead of the legacy short alias "emotion".
# The "split" configuration provides the train / validation / test splits used
# throughout this notebook.
# `trust_remote_code` is no longer passed: recent `datasets` releases do not
# support loading scripts, and this dataset is served as plain data files.
emotion_dataset = load_dataset("dair-ai/emotion", "split")
emotion_dataset

In [None]:
# Look at one randomly chosen training example.
# `randrange(n)` returns an index in [0, n), so it is always valid. The former
# `randint(0, 16000)` includes its upper bound and would raise an IndexError
# whenever it returned an index equal to the number of rows in the split.
train_split = emotion_dataset["train"]
random_integer = random.randrange(len(train_split))

train_split[random_integer]

In [None]:
# Convert the training split to a pandas DataFrame and peek at rows 10 through 14.
emotion_df = emotion_dataset["train"].to_pandas()
emotion_df.iloc[10:15]

In [None]:
# The feature schema of the training split (column names and their types).
train_split_for_features = emotion_dataset["train"]
features = train_split_for_features.features
features

In [None]:
# Translate the integer class id 3 into its label name.
label_feature_demo = features["label"]
label_feature_demo.int2str(3)

In [None]:
# Map every integer class id to its label name.
# The number of classes is read from the label feature itself instead of being
# hard-coded as range(6), so the mapping stays correct if the label set changes.
label_feature = features["label"]
id2label = {idx: label_feature.int2str(idx) for idx in range(label_feature.num_classes)}
id2label

In [None]:
# Inverse mapping: label name -> integer class id.
label2id = {name: idx for idx, name in id2label.items()}
label2id

In [None]:
# See the 5:32 minute mark of the video: the distribution of labels is very uneven!
label_shares = emotion_df["label"].value_counts(normalize=True)
label_shares.sort_index()

[sklearn F1 score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)

To evaluate the model we will use an F-score, which is often used when the classes are imbalanced.

# Tokenize everything

# Pretrained model checkpoint: [Hugging Face card](https://huggingface.co/microsoft/MiniLM-L12-H384-uncased)

In [None]:
# "ckpt" is short for "checkpoint": the name of the pretrained model to start from.
model_ckpt = "microsoft/MiniLM-L12-H384-uncased"

# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# sample="I did not go running"
# Index the row first and then pick the column, so only one example is read
# instead of materializing the whole "text" column to take its first element.
sample = emotion_dataset["train"][0]["text"]

encoded_input = tokenizer(sample, return_tensors="pt")

# Print the token IDs (numbers)
token_ids = encoded_input["input_ids"][0]
print(token_ids)

# Convert the token IDs back to tokens (subwords) and print them
tokens = tokenizer.convert_ids_to_tokens(token_ids)
print(tokens)

# Print the original text for reference
print(sample)
encoded_input

In [None]:
def tokenize_text(examples):
    """Tokenize the "text" field of a batch of examples.

    Uses the notebook-level `tokenizer`. Inputs longer than 512 tokens are
    truncated to that length.

    Args:
        examples: A mapping with a "text" entry.

    Returns:
        Whatever the tokenizer returns for that text.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=512)
    return encoded

In [None]:
# Tokenize every split; batched=True hands tokenize_text batches of examples
# rather than one example at a time.
emotion_dataset = emotion_dataset.map(
    tokenize_text,
    batched=True,
)
emotion_dataset

# Dealing with the imbalanced classes

In [None]:
# Absolute number of training examples per class (most frequent first).
label_counts = emotion_df["label"].value_counts()
label_counts

In [None]:

# Weight each class by (1 - its share of the training examples), so that rarer
# classes receive larger weights. Each intermediate form is printed in turn.
sorted_label_counts = emotion_df["label"].value_counts().sort_index()
sorted_label_shares = sorted_label_counts / len(emotion_df)
class_weights = (1 - sorted_label_shares).values
print(class_weights)

# NumPy array -> torch tensor
class_weights = torch.from_numpy(class_weights)
print(class_weights)

# Cast to float32
class_weights = class_weights.float()
print(class_weights)

# The tensor is left where it is here (append .to("cuda") to place it on a GPU).
print(class_weights)

In [None]:
## See 12:57 minute mark of video
# The custom loss further down reads the targets from inputs["labels"], so the
# column is renamed from "label" to "labels".
# The guard keeps this cell re-runnable: renaming a second time would fail
# because the "label" column no longer exists after the first run.
if "label" in emotion_dataset["train"].column_names:
    emotion_dataset = emotion_dataset.rename_column("label", "labels")

In [None]:
# Display the dataset again to check the columns after the rename.
emotion_dataset

In [None]:
batch_size = 64
# Number of full batches in one pass over the training split. Used as the
# logging and evaluation interval so both happen roughly once per epoch.
logging_steps = len(emotion_dataset["train"]) // batch_size
output_dir = "minilm-finetuned-emotion"

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # `eval_strategy` is the newer name of `evaluation_strategy`. It has to be
    # set explicitly: with the default strategy ("no") `eval_steps` is ignored
    # and the validation split is never evaluated during training.
    eval_strategy="steps",
    eval_steps=logging_steps,
    logging_steps=logging_steps,
    # Mixed precision requires a CUDA GPU; fall back to full precision on CPU
    # so the cell also runs on a CPU runtime.
    fp16=torch.cuda.is_available(),
    # Set based on your needs. Pushing requires being logged in to the
    # Hugging Face Hub; set to False to keep the model local.
    push_to_hub=True,
    report_to="none",  # This disables integration with W&B
)


**The class `Trainer` has a method named `compute_loss`.
We are going to define a subclass of `Trainer` so that we can override `compute_loss` and use the class weights computed above.**

In [None]:
class WeightedLossTrainer(Trainer):
    """Trainer that replaces the default loss with class-weighted cross-entropy.

    The per-class weights are read from the notebook-level `class_weights`
    tensor, which must be defined before training starts.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Mapping of model inputs; must contain a "labels" entry.
            return_outputs: If True, also return the model outputs.
            num_items_in_batch: Accepted for compatibility with the Trainer
                API; not used by this loss.

        Returns:
            The loss tensor, or a `(loss, outputs)` tuple when
            `return_outputs` is True.
        """
        labels = inputs.get("labels")
        # Run the forward pass without the labels. Otherwise the model would
        # also compute its own unweighted loss, which is never used here.
        # A filtered copy is used so the caller's `inputs` is left untouched.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")

        # Put the weights on the same device as the logits. A local variable is
        # used instead of overwriting an attribute on the trainer every step.
        weights = class_weights.to(logits.device)

        loss_func = nn.CrossEntropyLoss(weight=weights)
        loss = loss_func(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
# Load the pretrained checkpoint with a fresh sequence-classification head.
# The number of labels is derived from the id2label mapping instead of being
# hard-coded as 6, so it cannot drift out of sync with the label mappings.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute weighted F1 and accuracy for a set of predictions.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the argmax over the last axis is taken as the
            predicted class).

    Returns:
        A dict with the keys "f1" and "accuracy".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "f1": f1_score(y_true, y_pred, average="weighted"),
        "accuracy": accuracy_score(y_true, y_pred),
    }


In [None]:
# Build the trainer with the weighted loss, the metrics function and the
# train / validation splits.
# The tokenizer is passed as `processing_class`: the `tokenizer=` argument of
# Trainer is deprecated in the transformers releases that also provide the
# `num_items_in_batch` argument used by `compute_loss` above.
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=emotion_dataset["train"],
    eval_dataset=emotion_dataset["validation"],
    processing_class=tokenizer,
)

In [None]:
# Run the fine-tuning (a GPU runtime is strongly recommended for this cell).
trainer.train()

In [None]:
# Evaluate the fine-tuned model on the held-out test split and show the metrics.
test_split_for_eval = emotion_dataset["test"]
test_results = trainer.evaluate(eval_dataset=test_split_for_eval)
print(test_results)


# Inference

In [None]:
from transformers import pipeline

In [None]:
# Model id used for inference. NOTE(review): presumably the Hub repository a
# previous run of this notebook pushed to; replace it with your own repository
# to try the model you just trained.
model_ckpt = "alex-smith/minilm-finetuned-emotion"

In [None]:
# Wrap the model in a text-classification pipeline for easy inference on raw strings.
pipe = pipeline(
    "text-classification",
    model=model_ckpt,
)

In [None]:
# Classify a hand-written sentence with the pipeline.
pipe("I really dislike integration by parts if we have to do it twice!")

In [None]:

# Pick one random example from the training split and one from the test split.
# `randrange(n)` returns an index in [0, n), so it is always valid. The former
# `randint(0, 16000)` / `randint(0, 2000)` include their upper bound and would
# raise an IndexError whenever that bound equals the number of rows.
train_split = emotion_dataset["train"]
test_split = emotion_dataset["test"]

train_sample = random.randrange(len(train_split))
test_sample = random.randrange(len(test_split))

# Index the row first so only that one example is read, rather than loading
# the whole "text" and "labels" columns to take a single element from each.
train_example = train_split[train_sample]
train_text = train_example["text"]
train_label = train_example["labels"]

test_example = test_split[test_sample]
test_text = test_example["text"]
test_label = test_example["labels"]

# Classify the training text
train_result = pipe(train_text)
print(f"Training text classification result:\n {train_result}, actual label: {id2label[train_label]}")
print(f"train text: {train_text}")

print("\n")
# Classify the testing text
test_result = pipe(test_text)
print(f"Testing text classification result:\n {test_result}, actual label: {id2label[test_label]}")
print(f"test text: {test_text}")