In [44]:
# First of all, imports
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer, DataCollatorWithPadding, DistilBertConfig, create_optimizer
from datasets import Dataset
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from tensorflow.keras.metrics import AUC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# Setting up the model and tokenizer
config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=7)

model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=config)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [None]:
# Starting the data processing before training
raw_df = pd.read_csv('../datasets/dataset_v1_GoodOne.csv', engine='python')

# Renaming the columns in order to work on other versions of the dataset
raw_df = raw_df.rename(columns={'text': 'Translation', 'dominant_emotion': 'Corrected_Emotion'})
raw_df.columns

Index(['Translation', 'Corrected_Emotion', 'POS_Tags', 'TF_IDF',
       'Sentiment_Score', 'Pretrained_Embeddings', 'Custom_Embeddings',
       'Cleaned_Text'],
      dtype='object')

In [None]:
# Selecting the columns we need for training
df = raw_df[['Translation', 'Corrected_Emotion']]

# Encoding the labels
label_encoder = LabelEncoder()
label_encoder.fit(df["Corrected_Emotion"])
df["Encoded_Emotion"] = label_encoder.transform(df["Corrected_Emotion"])

# I am performing custom class weights for a better performance during training
# The class weights are calculated based on the frequency of each class in the training set
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(df["Encoded_Emotion"]),  # Using encoded labels (numerical values)
    y=df["Encoded_Emotion"]  # Encode the 'Corrected_Emotion' column
)

class_weights = dict(enumerate(class_weights))

# Dropping the unencoded column because we don't need it anymore
df = df.drop(["Corrected_Emotion"], axis=1)

# 80% training, 20% validation split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Encoded_Emotion'])

# Transforming the DataFrame into a Dataset object from Hugging Face
train_df = Dataset.from_pandas(train_df)
val_df = Dataset.from_pandas(val_df)

# Define tokenization function
def tokenizer_function(examples):
    return tokenizer(
        examples["Translation"],
        truncation=True,
        padding='max_length',
        max_length=128
    )

# Tokenize dataset
train_tokenized_df = train_df.map(tokenizer_function, batched=True)
val_tokenized_df = val_df.map(tokenizer_function, batched=True)

# Convert dataset to TensorFlow format
def convert_to_tf_dataset(dataset):
    input_ids = np.array(dataset["input_ids"], dtype=np.int32)
    attention_mask = np.array(dataset["attention_mask"], dtype=np.int32)
    labels = to_categorical(np.array(dataset["Encoded_Emotion"], dtype=np.int32), num_classes=7)

    features = {
        "input_ids": tf.convert_to_tensor(input_ids),
        "attention_mask": tf.convert_to_tensor(attention_mask),
    }
    labels = tf.convert_to_tensor(labels, dtype=tf.float32)

    return tf.data.Dataset.from_tensor_slices((features, labels)).batch(16).prefetch(tf.data.experimental.AUTOTUNE)

# I need to convert the dataset to TensorFlow format for training
train_dataset = convert_to_tf_dataset(train_tokenized_df)
val_dataset = convert_to_tf_dataset(val_tokenized_df)

# Define the optimizer and learning rate schedule
num_train_steps = len(train_dataset) * 10  # epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=1e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.05,
    num_warmup_steps=num_train_steps * 0.1,  # 10% warmup
)

# Compile the model with the optimizer and loss function
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Encoded_Emotion"] = label_encoder.transform(df["Corrected_Emotion"])


Map:   0%|          | 0/202118 [00:00<?, ? examples/s]

Map:   0%|          | 0/50530 [00:00<?, ? examples/s]

In [None]:
# Train the model
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,
    class_weight=class_weights
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
### TESTING THE MODEL ###
# Same steps for preprocessing the test set
# Loading the test set
test_set = pd.read_csv('../datasets/group 11_url1.csv')

test_set = test_set[['Translation', 'Emotion']]

test_set = test_set.dropna(subset=['Emotion'])
print(test_set['Emotion'].isnull().sum())

# Mapping the emotions to the main emotions
# This mapping is based on the diagram from the GitHub Pages
emotion_mapping = {
    'neutral': 'neutral',
    'disgust': 'disgust',
    'surprise': 'surprise',
    'curiosity': 'surprise',
    'excitement': 'happiness',
    'fear': 'fear',
    'optimism': 'happiness',
    'nervousness': 'fear',
    'confusion': 'surprise',
    'annoyance': 'anger',
    'caring': 'happiness',
    'disapproval': 'anger',
    'approval': 'happiness',
    'disappointment': 'sadness',
    'admiration': 'happiness',
    'desire': 'happiness',
    'amusement': 'happiness',
    'sadness': 'sadness',
    'pride': 'happiness',
    'gratitude': 'happiness',
    'realization': 'happiness',
    'relief': 'happiness',
    'remorse': 'sadness',
    'joy': 'happiness',
    'love': 'happiness',
    'anger': 'anger',
}

test_set['main_emotion'] = test_set['Emotion'].apply(lambda x: emotion_mapping.get(x, 'Other'))

test_set = test_set[['main_emotion', 'Translation']]

test_set['main_emotion'].value_counts()

0


main_emotion
neutral      328
surprise     318
happiness    182
fear          54
anger         34
sadness       24
disgust        1
Name: count, dtype: int64

In [None]:
# Splitting the test set into features and labels
X_test = test_set['Translation'].tolist()

# Tokenize X_test
X_test_tokenized = tokenizer(X_test,
                             truncation=True,
                             padding='max_length',
                             max_length=128,
                             return_tensors='tf')

# Prepare y_test (the ground truth labels)
y_test = test_set['main_emotion']
label_encoder = LabelEncoder()
y_test = label_encoder.fit_transform(y_test)  # Transform labels into numeric format

# Make predictions (logits are returned here)
y_pred = model.predict(X_test_tokenized)

# Extract logits from the model's output
logits = y_pred.logits

# Convert logits to predicted class labels (by selecting the class with the highest logit)
y_pred_classes = np.argmax(logits, axis=-1)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred_classes)
precision = precision_score(y_test, y_pred_classes, average='weighted')
recall = recall_score(y_test, y_pred_classes, average='weighted')
f1 = f1_score(y_test, y_pred_classes, average='weighted')

# Output the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Classification Report
report = classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_)
print("\nClassification Report:")
print(report)

Accuracy: 0.3486
Precision: 0.6726
Recall: 0.3486
F1 Score: 0.2038

Classification Report:
              precision    recall  f1-score   support

       anger       0.16      0.21      0.18        34
     disgust       0.00      0.00      0.00         1
        fear       0.13      0.09      0.11        54
   happiness       1.00      0.01      0.01       182
     neutral       0.37      0.95      0.53       328
     sadness       0.00      0.00      0.00        24
    surprise       1.00      0.01      0.01       318

    accuracy                           0.35       941
   macro avg       0.38      0.18      0.12       941
weighted avg       0.67      0.35      0.20       941



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Saving the model
model.save_pretrained('./dbert')