# Libraries

In [1]:
import os
import random
import numpy as np
import pandas as pd
import torch
import transformers
from datasets import load_dataset

In [2]:
seed = 42
lang = 'sun'

hf_data_id = 'alxxtexxr/SemEval2025-Task11-Dataset'
hf_data_config = 'track_a_sun_70_15_15_stratify_v2'

In [3]:
def set_seed(seed):
    # Set random seed for NumPy
    np.random.seed(seed)

    # Set random seed for Torch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if using multi-GPU
    torch.backends.cudnn.deterministic = True  # Ensures deterministic results
    torch.backends.cudnn.benchmark = False  # Avoids non-deterministic algorithms

    # Set random seed for Transformers
    transformers.set_seed(seed)

    # Optionally set random seed for sklearn and Python's own random module
    random.seed(seed)

    # Set random seed for os
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Random seed set to: {seed}")

set_seed(seed)

Random seed set to: 42


# Data

## Load Data

In [4]:
datasets = load_dataset(hf_data_id, hf_data_config)

cols = list(datasets['train'].features)
emotion_cols = [col for col in cols if col not in ['Unnamed: 0', 'text', 'emotion']]
splits = [*datasets.keys()]

print("Splits:", splits)
print("Data columns:", cols)
print("Emotions columns:", emotion_cols)

Splits: ['train', 'val', 'test']
Data columns: ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']
Emotions columns: ['marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']


In [5]:
df = {split: pd.DataFrame(datasets[split]) for split in splits}
df_all = pd.concat(df.values())
df_all.head()

Unnamed: 0,text,emotion,marah,jijik,takut,senang,sedih,terkejut,biasa
0,Aya randa ker nguseup Pantun sunda meuni reuseup,senang,0,0,0,1,0,0,0
1,pastina ath mang ku abdi shere ken knu grup + SW,biasa,0,0,0,0,0,0,1
2,Mang Fiksi téh urang Majalengka ogé?,senang,0,0,0,1,0,0,0
3,"mang dana ,uing can ngopi, kumaha ieu?",senang,0,0,0,1,0,0,0
4,Sedih Mang Ai kudu D Caritakeun Mah😄🙏,"senang, sedih",0,0,0,1,1,0,0


## EDA

In [6]:
print("Training dataset size:", len(df_all))

Training dataset size: 924


In [7]:
print("Emotion combinations distribution:")
df_all['emotion'].value_counts()

Emotion combinations distribution:


emotion
senang                                  434
senang, terkejut                        131
sedih                                    82
biasa                                    43
senang, sedih                            42
terkejut                                 16
sedih, terkejut                          16
marah, jijik                             15
senang, sedih, terkejut                  11
takut, sedih                             11
jijik, senang                            10
marah, sedih                              9
marah, jijik, terkejut                    8
marah                                     8
marah, sedih, terkejut                    8
takut, senang                             7
marah, jijik, senang                      6
takut, senang, terkejut                   6
marah, senang, terkejut                   6
takut                                     5
jijik, sedih                              5
marah, jijik, sedih                       5
takut, sedih, terkejut  

In [8]:
# for emotion_col in emotion_cols:
#     comb_emotion_total = 0
#     for single_emotion, single_emotion_total in df['train'][df['train'][emotion_col] == 1]['emotion'].value_counts().items():
#         if single_emotion == emotion_col:
#             print(f"{single_emotion}:", single_emotion_total)
#         else:
#             comb_emotion_total += single_emotion_total
#     print(f"{emotion_col}~:", comb_emotion_total)
#     print()

In [11]:
def create_num_emotions_col(row):
    return sum(row[emotion_cols].tolist())

df_all.apply(create_num_emotions_col, axis=1)
df_all['num_emotions'] = df_all.apply(create_num_emotions_col, axis=1)
df_all.head()

Unnamed: 0,text,emotion,marah,jijik,takut,senang,sedih,terkejut,biasa,num_emotions
0,Aya randa ker nguseup Pantun sunda meuni reuseup,senang,0,0,0,1,0,0,0,1
1,pastina ath mang ku abdi shere ken knu grup + SW,biasa,0,0,0,0,0,0,1,1
2,Mang Fiksi téh urang Majalengka ogé?,senang,0,0,0,1,0,0,0,1
3,"mang dana ,uing can ngopi, kumaha ieu?",senang,0,0,0,1,0,0,0,1
4,Sedih Mang Ai kudu D Caritakeun Mah😄🙏,"senang, sedih",0,0,0,1,1,0,0,2


In [61]:
single_emotion_totals = [len(df_all[(df_all[emotion_col] == 1) & (df_all['num_emotions'] == 1)]) for emotion_col in emotion_cols]
comb_emotion_totals = [len(df_all[(df_all[emotion_col] == 1) & (df_all['num_emotions'] > 1)]) for emotion_col in emotion_cols]


for emotion, single_emotion_total, comb_emotion_total in zip(emotion_cols, single_emotion_totals, comb_emotion_totals):
    diff_from_max_single = max(comb_emotion_totals) - single_emotion_total
    diff_from_max_comb = max(comb_emotion_totals) - comb_emotion_total
    ratio = single_emotion_total / (comb_emotion_total if comb_emotion_total else 1) * 100

    print(f"{emotion}:", single_emotion_total, '\t', '-> diff. from max.:', diff_from_max_single)
    print(f"{emotion}~:", comb_emotion_total, '\t', '-> diff. from max.:', diff_from_max_comb)
    print("----------------")
    print(f"ratio: {ratio:.3f}%")
    print()

marah: 8 	 -> diff. from max.: 230
marah~: 76 	 -> diff. from max.: 162
----------------
ratio: 10.526%

jijik: 1 	 -> diff. from max.: 237
jijik~: 67 	 -> diff. from max.: 171
----------------
ratio: 1.493%

takut: 5 	 -> diff. from max.: 233
takut~: 42 	 -> diff. from max.: 196
----------------
ratio: 11.905%

senang: 434 	 -> diff. from max.: -196
senang~: 238 	 -> diff. from max.: 0
----------------
ratio: 182.353%

sedih: 82 	 -> diff. from max.: 156
sedih~: 130 	 -> diff. from max.: 108
----------------
ratio: 63.077%

terkejut: 16 	 -> diff. from max.: 222
terkejut~: 210 	 -> diff. from max.: 28
----------------
ratio: 7.619%

biasa: 43 	 -> diff. from max.: 195
biasa~: 0 	 -> diff. from max.: 238
----------------
ratio: 4300.000%



In [38]:
print("Emotions distribution:")
df['train'][emotion_cols].sum().sort_values(ascending=False)

Emotions distribution:


senang      481
terkejut    170
sedih       158
marah        70
jijik        58
takut        42
biasa        30
dtype: int64