In [None]:
import os
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Process-wide switches, set before any training machinery starts:
# Weights & Biases reporting off, NCCL peer-to-peer and InfiniBand
# transports disabled, NCCL logging at INFO level.
os.environ.update(
    {
        "WANDB_DISABLED": "true",
        "NCCL_P2P_DISABLE": "1",
        "NCCL_IB_DISABLE": "1",
        "NCCL_DEBUG": "INFO",
    }
)

# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")


# Emotion-labelled texts (sadness, joy, anger, etc.)
dataset = load_dataset("dair-ai/emotion")

# RoBERTa tokenizer and sequence-classification model
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Class names are read from the dataset's label feature; a name's position in
# the list is its integer class id.
label_names = dataset["train"].features["label"].names
label_map = {label: i for i, label in enumerate(label_names)}  # name -> id
num_labels = len(label_map)

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=num_labels,  # one output per class, taken from the dataset rather than hard-coded
    pad_token_id=tokenizer.pad_token_id,
    # Store the class names in the config so the saved checkpoint can map
    # predicted ids back to emotion names without reloading the dataset.
    id2label=dict(enumerate(label_names)),
    label2id=label_map,
).to(device)

# The tokenizer's own pad token is kept as-is. Re-pointing it at the EOS token
# is a GPT-2 workaround; here it would make tokenized batches pad with a
# different id than the pad_token_id the model was configured with above.

# Tokenize function
def tokenize_function(examples):
    """Tokenize one batch of dataset rows.

    Args:
        examples: Batch handed over by ``dataset.map(..., batched=True)``;
            its ``"text"`` entry holds the raw strings to encode.

    Returns:
        The tokenizer output for the batch, with every sequence padded or
        truncated to 128 tokens.
    """
    # No return_tensors here: map() stores the columns in the dataset itself,
    # so tensors built at this point would only be converted straight back.
    return tokenizer(
        examples["text"],
        padding="max_length",  # Ensures consistent length
        truncation=True,
        max_length=128,
    )

# Apply the tokenizer to every split, one batch of rows at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)



# Training arguments
from transformers import EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir="models/",
    evaluation_strategy="epoch",
    # Checkpoint on the same schedule as evaluation so the best epoch can be
    # identified and restored once training finishes.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=2,  # keep disk usage bounded
    num_train_epochs=100,  # upper bound only; early stopping normally ends the run sooner
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=3,
    weight_decay=0.01,
    logging_dir="logs/",
    report_to=[],
)

# Trainer
# Stop once validation loss has failed to improve for 3 consecutive epochs.
# In the recorded run the validation loss was lowest at epoch 4 and rose
# afterwards, so training on to the final epoch would save an overfit model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Train model
trainer.train()

# Save trained model and tokenizer. With load_best_model_at_end=True the
# weights saved here are those of the best-scoring checkpoint.
model.save_pretrained("models/")
tokenizer.save_pretrained("models/")


  from .autonotebook import tqdm as notebook_tqdm
2025-03-24 23:11:53.939743: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-24 23:11:54.111737: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742857914.192897     764 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742857914.217739     764 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-24 23:11:54.367400: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

Using device: cuda


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


02aa490fc410:764:764 [0] NCCL INFO cudaDriverVersion 12020
02aa490fc410:764:764 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
02aa490fc410:764:764 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so)
02aa490fc410:764:764 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so
02aa490fc410:764:764 [0] NCCL INFO NET/Plugin: Using internal network plugin.
NCCL version 2.21.5+cuda12.4
02aa490fc410:764:147768 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
02aa490fc410:764:147768 [2] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
02aa490fc410:764:147768 [2] NCCL INFO Using non-device net plugin version 0
02aa490fc410:764:147768 [2] NCCL INFO Using network Socket
02aa490fc410:764:147769 [3] NCCL INFO Using non-device net plugin version 0
02aa490fc410:764:147769 [3] NCCL INFO Using network Socket
02aa490fc410:764:147766 [0] NCCL INFO Using non-device net plugin version 0
02



Epoch,Training Loss,Validation Loss
1,No log,0.229838
2,No log,0.176721
3,No log,0.17329
4,0.289400,0.133101
5,0.289400,0.145986
6,0.289400,0.145351
7,0.289400,0.144629
8,0.100400,0.146914
9,0.100400,0.168935
10,0.100400,0.183779




('models/tokenizer_config.json',
 'models/special_tokens_map.json',
 'models/vocab.json',
 'models/merges.txt',
 'models/added_tokens.json')

In [None]:
import numpy as np
from sklearn.metrics import f1_score

# Held-out split; it was tokenized together with the other splits earlier.
test_dataset = tokenized_datasets["test"]

# Run the trained model over the whole test split.
predictions = trainer.predict(test_dataset)
labels = test_dataset["label"]  # True labels
logits = predictions.predictions
preds = np.argmax(logits, axis=-1)  # Highest-scoring class for each row

# Macro averaging weights every class equally, whatever its frequency.
f1_macro = f1_score(y_true=labels, y_pred=preds, average="macro")
print(f"F1 Macro Score: {f1_macro:.4f}")


F1 Macro Score: 0.9049


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import os
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Disable wandb
os.environ["WANDB_DISABLED"] = "true"
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"
os.environ["NCCL_DEBUG"] = "INFO"


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# Reload the fine-tuned checkpoint and place it on the device reported above;
# without the explicit .to(device) the model stays on the CPU.
model_path = "models/"  # Path where the model was saved
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
model.eval()  # Set model to evaluation mode

unseen_texts = [
    "I'm feeling fantastic today!",
    "I can't believe this happened. So frustrating!",
    "This is the worst day of my life."
]

# Pad to the longest text in this batch and return PyTorch tensors.
inputs = tokenizer(unseen_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
inputs = {key: value.to(model.device) for key, value in inputs.items()}  # Move inputs to the model's device

with torch.no_grad():
    outputs = model(**inputs)
    # Index of the highest-scoring class per text, moved back to the CPU so
    # the printed tensor looks the same whichever device ran the model.
    predictions = torch.argmax(outputs.logits, dim=1).cpu()
    print(predictions)




Using device: cuda
tensor([1, 3, 0])


In [None]:
import torch
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification



from datasets import load_dataset

model_path = "models/"  # Path where the model was saved
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)


# Hand-written, domain-specific examples with their expected emotion.
new_data = [
    {"text": "I’m scared my credit score will never recover.", "label": "fear"},
    {"text": "It’s unfair how high the debt settlement fees are!", "label": "anger"},
    {"text": "My low credit score makes me feel hopeless.", "label": "sadness"},
    {"text": "I’m so happy my credit score is improving!", "label": "joy"},
    {"text": "What if my credit score drops even more?", "label": "fear"},
    {"text": "Why does debt settlement take so long?", "label": "anger"},
    {"text": "I feel defeated seeing my credit card debt.", "label": "sadness"},
    {"text": "It’s great that I finally paid off my debt!", "label": "joy"},
    {"text": "Lenders charging extra interest disgusts me.", "label": "anger"},
    {"text": "I’m terrified of my debt going to collections.", "label": "fear"},
    {"text": "Credit card fees are way too high!", "label": "anger"},
    {"text": "I feel lost trying to fix my bad credit.", "label": "sadness"},
    {"text": "I’m relieved my loan got approved!", "label": "joy"},
    {"text": "I hate that my interest rate keeps rising.", "label": "disgust"},
    {"text": "I am scared and What if my loan application gets rejected?", "label": "fear"},
    {"text": "Why does fixing credit take so long?", "label": "anger"}
]


texts = [item["text"] for item in new_data]
labels = [item["label"] for item in new_data]


inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
inputs = {key: value.to(model.device) for key, value in inputs.items()}  # Move inputs to correct device


model.eval()  # Set model to evaluation mode
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=1)  # Get label with highest probability


# Class names in id order, read from the dataset itself so this cell does not
# rely on a `dataset` variable left behind by another cell.
class_names = load_dataset("dair-ai/emotion", split="train").features["label"].names
id2label = dict(enumerate(class_names))
predicted_labels = [id2label[pred.item()] for pred in predictions]

# A gold label that is not one of the model's classes can never be predicted,
# so it lowers the score regardless of model quality. Report it, don't hide it.
unknown_labels = sorted(set(labels) - set(class_names))
if unknown_labels:
    print(f"Warning: gold labels not in the model's label set: {unknown_labels}")

print(predicted_labels)
# Calculate F1 score
f1 = f1_score(labels, predicted_labels, average="weighted")  # Use "weighted" for imbalanced classes
print(f"F1 Score: {f1:.4f}")

['fear', 'anger', 'sadness', 'joy', 'fear', 'anger', 'sadness', 'joy', 'anger', 'fear', 'anger', 'sadness', 'joy', 'anger', 'fear', 'anger']
F1 Score: 0.9091
