In [1]:
# First of all, imports
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer, DataCollatorWithPadding, DistilBertConfig
from datasets import Dataset
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import f1_score
from tensorflow.keras.metrics import AUC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

2025-03-19 15:42:20.717234: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-19 15:42:20.733782: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742398940.749786  234082 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742398940.754585  234082 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742398940.770280  234082 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=7)

model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=config)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

2025-03-19 15:42:24.136071: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
I0000 00:00:1742398944.136399  234082 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 29173 MB memory:  -> device: 0, name: NVIDIA RTX 6000 Ada Generation, pci bus id: 0000:81:00.0, compute capability: 8.9
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializi

In [7]:
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

In [8]:
raw_df = pd.read_csv('../datasets/dataset_v1_GoodOne.csv', engine='python')
raw_df = raw_df.rename(columns={'text': 'Translation', 'dominant_emotion': 'Corrected_Emotion'})
raw_df.columns

Index(['Translation', 'Corrected_Emotion', 'POS_Tags', 'TF_IDF',
       'Sentiment_Score', 'Pretrained_Embeddings', 'Custom_Embeddings',
       'Cleaned_Text'],
      dtype='object')

In [9]:
df = raw_df[['Translation', 'Corrected_Emotion']]

print(df['Corrected_Emotion'].value_counts())

label_encoder = LabelEncoder()

df["Corrected_Emotion"] = label_encoder.fit_transform(df["Corrected_Emotion"])

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Corrected_Emotion'])

train_df = Dataset.from_pandas(train_df)
val_df = Dataset.from_pandas(val_df)

# Define tokenization function
def tokenizer_function(examples):
    return tokenizer(
        examples["Translation"],
        truncation=True,
        padding='max_length',
        max_length=128
    )

# Tokenize dataset
train_tokenized_df = train_df.map(tokenizer_function, batched=True)
val_tokenized_df = val_df.map(tokenizer_function, batched=True)

# Convert dataset to TensorFlow format
def convert_to_tf_dataset(dataset):
    input_ids = np.array(dataset["input_ids"], dtype=np.int32)
    attention_mask = np.array(dataset["attention_mask"], dtype=np.int32)
    labels = to_categorical(np.array(dataset["Corrected_Emotion"], dtype=np.int32), num_classes=7)

    features = {
        "input_ids": tf.convert_to_tensor(input_ids),
        "attention_mask": tf.convert_to_tensor(attention_mask),
    }
    labels = tf.convert_to_tensor(labels, dtype=tf.float32)

    return tf.data.Dataset.from_tensor_slices((features, labels)).batch(16).prefetch(tf.data.experimental.AUTOTUNE)

train_dataset = convert_to_tf_dataset(train_tokenized_df)
val_dataset = convert_to_tf_dataset(val_tokenized_df)

Corrected_Emotion
anger        66732
sadness      53541
fear         49402
happiness    45713
neutral      18423
surprise     17752
disgust       1085
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Corrected_Emotion"] = label_encoder.fit_transform(df["Corrected_Emotion"])


Map:   0%|          | 0/202118 [00:00<?, ? examples/s]

Map:   0%|          | 0/50530 [00:00<?, ? examples/s]

In [11]:
total_samples = sum([66732, 53541, 49402, 45713, 18423, 17752, 1085])
class_counts = 7
class_weights = {}

for i, count in enumerate(class_counts):
    # More weight for less frequent classes
    class_weights[i] = total_samples / (len(class_counts) * count)

print("Class weights:", class_weights)

Class weights: {0: 0.5408585300691037, 1: 0.6741108949883534, 2: 0.73058927631617, 3: 0.7895472060151691, 4: 1.9591039151371346, 5: 2.0331552179231314, 6: 33.26504279131007}


In [13]:
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=15,
    class_weight=class_weights,
)

Epoch 1/15


I0000 00:00:1742399537.726370  234739 service.cc:152] XLA service 0x7f240098e540 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1742399537.726400  234739 service.cc:160]   StreamExecutor device (0): NVIDIA RTX 6000 Ada Generation, Compute Capability 8.9
2025-03-19 15:52:17.732563: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1742399538.010705  234739 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1742399538.195070  234739 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/15
Epoch 3/15

KeyboardInterrupt: 