In [14]:
from sklearn.model_selection import train_test_split

import pandas as pd
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login

from datasets import load_dataset, Dataset, ClassLabel, DatasetDict

from sklearn.metrics import accuracy_score, f1_score, plot_confusion_matrix

# Load Datasets

## Call Centre dataset

In [18]:
# Load dataset
data = pd.read_csv('/mnt/disk2/arshia.yousefinezhad/emotion_detection/data/preprocess_labelencoding_data.csv')


# Split dataset to train test validation
call_data, data_val = train_test_split(data,test_size=0.15,  random_state=42 , stratify=data.emotion)
data_train, data_test = train_test_split(call_data,test_size=0.1 ,  random_state=42 , stratify=call_data.emotion)

# convert in to huggingface dataset
train_dataset = Dataset.from_pandas(data_train)
train_dataset = train_dataset.remove_columns(["__index_level_0__"])

val_dataset = Dataset.from_pandas(data_val)
val_dataset = val_dataset.remove_columns(["__index_level_0__"])

test_dataset = Dataset.from_pandas(data_test)
test_dataset = test_dataset.remove_columns(["__index_level_0__"])


***Resources***
1.   [Fine-tuning RoBERTa for Topic Classification with Hugging Face Transformers and Datasets Library](https://medium.com/@achillesmoraites/fine-tuning-roberta-for-topic-classification-with-hugging-face-transformers-and-datasets-library-c6f8432d0820)



**Models**
1.   Xlm-Roberta
2.   Roberta




# Transformer models

## Datasets

In [19]:
# convert dataset to pandas dataset
df_train = train_dataset.to_pandas()
df_val = val_dataset.to_pandas()
df_test = test_dataset.to_pandas()

### Preparing labels

In [None]:
#  unique labels
unique_labels_train = list(sorted(df_train['emotion'].unique()))

# ClassLabels emotion
class_label_feature = ClassLabel(names=unique_labels_train)


def label_str_to_int_call(example):
    example['emotion'] = class_label_feature.str2int(example['emotion'])
    return example



# calls dataset
train_dataset = train_dataset.map(label_str_to_int_call)
train_dataset = train_dataset.cast_column('emotion', class_label_feature)

val_dataset = val_dataset.map(label_str_to_int_call)
val_dataset = val_dataset.cast_column('emotion', class_label_feature)

test_dataset = test_dataset.map(label_str_to_int_call)
test_dataset = test_dataset.cast_column('emotion', class_label_feature)


# Models

### some initializations

### Configoration

In [21]:
df = df_train.copy()

roberta_id = "roberta-base"

label_list = list(sorted(df['emotion'].unique()))

label2id = {label: i for i, label in enumerate(label_list)}
id2label = {v: k for k, v in label2id.items()}

# Update the model's configuration with the id2label mapping
config = AutoConfig.from_pretrained(roberta_id)
config.update({"id2label": id2label})

## Roberta

### Tokenization

In [None]:
# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(roberta_id)

# This function tokenizes the input text using the RoBERTa tokenizer.
# It applies padding and truncation to ensure that all sequences have the same length (256 tokens).
# https://huggingface.co/learn/nlp-course/chapter5/3?fw=pt#the-map-methods-superpowers
def tokenize(batch):
    return tokenizer(batch["combined_text"], padding=True, truncation=True, max_length=256)

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

train_dataset = train_dataset.remove_columns(["combined_text"])
val_dataset = val_dataset.remove_columns(["combined_text"])
test_dataset = test_dataset.remove_columns(["combined_text"])

train_dataset = train_dataset.rename_column("emotion", "labels")
val_dataset = val_dataset.rename_column("emotion", "labels")
test_dataset = test_dataset.rename_column("emotion", "labels")


# Set dataset format
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

### Defining Roberta model

In [None]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [25]:
train_dataset.features

{'labels': ClassLabel(names=[' عاشقانه و خوشحال', 'عصبانی', 'غمگین و مضطرب', 'معمولی', 'هیجانی و متعجب'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [26]:
# Model
model = RobertaForSequenceClassification.from_pretrained(roberta_id, num_labels=5)

# TrainingArguments
training_args = TrainingArguments(
    output_dir="emotion_detection_roberta_base",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    report_to=['tensorboard'],
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
500,1.1161,1.103848,0.643852




TrainOutput(global_step=825, training_loss=1.1114401337594697, metrics={'train_runtime': 519.9398, 'train_samples_per_second': 50.712, 'train_steps_per_second': 1.587, 'total_flos': 3468818031174144.0, 'train_loss': 1.1114401337594697, 'epoch': 3.0})

In [27]:
trainer.evaluate()

{'eval_loss': 1.1038479804992676,
 'eval_accuracy': 0.6438515081206496,
 'eval_runtime': 10.6511,
 'eval_samples_per_second': 161.861,
 'eval_steps_per_second': 5.07,
 'epoch': 3.0}

In [29]:
trainer.predict(test_dataset)



PredictionOutput(predictions=array([[-0.73139083, -0.17399277,  0.00975015,  1.5033205 , -1.1484352 ],
       [-0.73140115, -0.17398909,  0.00975839,  1.5033292 , -1.1484486 ],
       [-0.73138946, -0.17399181,  0.00974631,  1.5033181 , -1.1484134 ],
       ...,
       [-0.73139757, -0.17399354,  0.00975172,  1.5033239 , -1.1484367 ],
       [-0.73140514, -0.17398879,  0.00976322,  1.5033376 , -1.1484594 ],
       [-0.7313976 , -0.17398432,  0.0097602 ,  1.5033306 , -1.1484532 ]],
      dtype=float32), label_ids=array([2, 3, 3, 3, 0, 3, 3, 1, 3, 3, 3, 3, 1, 3, 3, 2, 3, 3, 3, 3, 2, 3,
       3, 3, 0, 3, 3, 3, 3, 1, 3, 3, 3, 2, 4, 3, 3, 3, 4, 1, 2, 2, 3, 1,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 2, 3, 3, 0, 2, 3, 4,
       4, 3, 1, 2, 3, 1, 3, 1, 2, 3, 3, 3, 3, 0, 3, 1, 3, 2, 3, 3, 3, 1,
       0, 4, 3, 3, 3, 2, 0, 3, 1, 1, 3, 2, 3, 3, 3, 1, 1, 1, 1, 4, 3, 3,
       3, 3, 0, 3, 3, 3, 3, 3, 2, 3, 1, 3, 3, 2, 1, 3, 3, 3, 3, 0, 3, 3,
       3, 2, 3, 3, 3, 3, 3, 0, 3, 3, 1, 1, 2

In [30]:
test_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 977
})

In [31]:
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
def extract_hidden_states(batch):
    # Place model inputs on the GPU
    inputs = {k:v.to(device) for k,v in batch.items()
                if k in tokenizer.model_input_names}
    # Extract last hidden states
    with torch.no_grad():
    last_hidden_state = model(**inputs).last_hidden_state
    # Return vector for [CLS] token
    return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}

train_dataset.map(extract_hidden_states, batched=True)

In [None]:

preds_output = trainer.predict(test_dataset)
y_preds = np.argmax(preds_output.predictions, axis=1)
plot_confusion_matrix(y_preds, y_valid, labels)