# Libraries

In [38]:
import os
import random
import numpy as np
import pandas as pd
import torch
import transformers
from datasets import load_dataset

# Config

In [36]:
seed = 42

hf_data_id = 'alxxtexxr/SemEval2025-Task11-Dataset'
# hf_data_config = 'track_a_sun_70_15_15_stratify_v2'
hf_data_config = 'track_a_sun_go_emotions_70_15_15_stratify_v2'

# add_data_paths = [
#     'data/go_emotions_sun/fear_single_242_comb_79_translated_cleaned_curated_merged_final.csv',
#     'data/go_emotions_sun/anger_single_237_comb_73_translated_cleaned_curated_merged_final.csv',
#     'data/go_emotions_sun/disgust_single_210_comb_35_translated_cleaned_curated_merged_final.csv',
#     'data/go_emotions_sun/sadness_single_183_comb_20_translated_cleaned_curated_merged_final.csv',
#     'data/go_emotions_sun/surprise_single_240_comb_22_translated_cleaned_curated_merged_final.csv',
#     'data/go_emotions_sun/neutral_single_210_comb_0_translated_cleaned_curated_merged_final.csv',
# ]

In [39]:
def set_seed(seed):
    # Set random seed for NumPy
    np.random.seed(seed)

    # Set random seed for Torch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if using multi-GPU
    torch.backends.cudnn.deterministic = True  # Ensures deterministic results
    torch.backends.cudnn.benchmark = False  # Avoids non-deterministic algorithms

    # Set random seed for Transformers
    transformers.set_seed(seed)

    # Optionally set random seed for sklearn and Python's own random module
    random.seed(seed)

    # Set random seed for os
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Random seed set to: {seed}")

set_seed(seed)

Random seed set to: 42


# Data

## Load Data

In [3]:
datasets = load_dataset(hf_data_id, hf_data_config)

cols = list(datasets['train'].features)
emotion_cols = [col for col in cols if col not in ['Unnamed: 0', 'text', 'emotion']]
splits = [*datasets.keys()]

print("Splits:", splits)
print("Data columns:", cols)
print("Emotions columns:", emotion_cols)

README.md:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

(…)_emotions_70_15_15_stratify_v2/train.csv:   0%|          | 0.00/179k [00:00<?, ?B/s]

(…)go_emotions_70_15_15_stratify_v2/val.csv:   0%|          | 0.00/35.0k [00:00<?, ?B/s]

(…)o_emotions_70_15_15_stratify_v2/test.csv:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Splits: ['train', 'val', 'test']
Data columns: ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']
Emotions columns: ['marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']


In [4]:
df = {split: pd.DataFrame(datasets[split]) for split in splits}
full_df_ = pd.concat(df.values())
print("Full DF size:", len(full_df_))
full_df_.head()

Full DF size: 2495


Unnamed: 0,text,emotion,marah,jijik,takut,senang,sedih,terkejut,biasa
0,Ngobrol ka kolotna meureun lamun éta pilihan.,biasa,0,0,0,0,0,0,1
1,Urang mah sok bingung ka batur anu ngomong nye...,"senang, sedih",0,0,0,1,1,0,0
2,"""Bédana téh lantaran abdi pantes narimana."" Ie...",marah,1,0,0,0,0,0,0
3,Aranjeun téh barudak teuweul anu ambek sabab m...,"marah, jijik",1,1,0,0,0,0,0
4,Anjeun teu nyaho naon fakta téh?,biasa,0,0,0,0,0,0,1


In [12]:
if 'add_data_paths' in globals():
    add_df = pd.concat([pd.read_csv(add_data_path) for add_data_path in globals()['add_data_paths']])
    add_df.drop(['id', 'curation_status', 'text'], axis=1, inplace=True)
    add_df.rename(columns={'text_translated': 'text'}, inplace=True)
    add_df['emotion'] = add_df.apply(lambda row: ', '.join([col for col in emotion_cols if row[col] == 1]), axis=1)
    print("Additional DF size:", len(add_df))
    
    full_df = pd.concat([full_df_, add_df])
    print("Full DF size (with additional data):", len(full_df))
else:
    full_df = full_df_

## EDA

In [73]:
print("Distribution of emotions:")
full_df[emotion_cols].sum().sort_values(ascending=False)

Distribution of emotions:


senang      677
terkejut    509
sedih       482
marah       459
jijik       418
takut       378
biasa       253
dtype: int64

In [74]:
print("Emotion combinations distribution:")
full_df['emotion'].value_counts()

Emotion combinations distribution:


emotion
senang                                  434
sedih                                   266
terkejut                                256
biasa                                   253
takut                                   248
marah                                   246
jijik                                   221
senang, terkejut                        132
marah, jijik                             94
senang, sedih                            45
takut, sedih                             44
marah, sedih                             31
jijik, takut                             20
sedih, terkejut                          20
jijik, sedih                             20
takut, terkejut                          14
marah, takut                             14
marah, terkejut                          13
senang, sedih, terkejut                  11
jijik, senang                            10
jijik, terkejut                          10
marah, jijik, terkejut                    8
marah, sedih, terkejut  

In [52]:
full_df['num_emotions'] = full_df.apply(lambda row: int(sum(row[emotion_cols].tolist())), axis=1)

single_emotion_totals = [len(full_df[(full_df[emotion_col] == 1) & (full_df['num_emotions'] == 1)]) for emotion_col in emotion_cols]
comb_emotion_totals = [len(full_df[(full_df[emotion_col] == 1) & (full_df['num_emotions'] > 1)]) for emotion_col in emotion_cols]

min_single_emotion_total = float('inf')

print("Distribution of data with single emotion and combined emotions:\n")
for emotion, single_emotion_total, comb_emotion_total in zip(emotion_cols, single_emotion_totals, comb_emotion_totals):
    if single_emotion_total < min_single_emotion_total:
        min_single_emotion_total = single_emotion_total

    diff_from_max_single = max(comb_emotion_totals) - single_emotion_total
    diff_from_max_comb = max(comb_emotion_totals) - comb_emotion_total
    ratio = single_emotion_total / (comb_emotion_total if comb_emotion_total else 1) * 100
    total = single_emotion_total + comb_emotion_total

    print(f"{emotion:<15}:", single_emotion_total, "\t-> Difference from max:", diff_from_max_single, end="")
    print()

    print(f"{emotion + ' + ...':<15}:", comb_emotion_total, end="")
    if emotion != "biasa":
        print("\t-> Difference from max:", diff_from_max_comb, end="")
    print()

    print("-"*15)
    print("Total:", total)

    if emotion != "biasa":
        print(f"Ratio: {ratio:.3f}%")
    print()

print("Min. single emotion total:", min_single_emotion_total)

Distribution of data with single emotion and combined emotions:

marah          : 246 	-> Difference from max: 7
marah + ...    : 213	-> Difference from max: 40
---------------
Total: 459
Ratio: 115.493%

jijik          : 221 	-> Difference from max: 32
jijik + ...    : 197	-> Difference from max: 56
---------------
Total: 418
Ratio: 112.183%

takut          : 248 	-> Difference from max: 5
takut + ...    : 130	-> Difference from max: 123
---------------
Total: 378
Ratio: 190.769%

senang         : 434 	-> Difference from max: -181
senang + ...   : 243	-> Difference from max: 10
---------------
Total: 677
Ratio: 178.601%

sedih          : 266 	-> Difference from max: -13
sedih + ...    : 216	-> Difference from max: 37
---------------
Total: 482
Ratio: 123.148%

terkejut       : 256 	-> Difference from max: -3
terkejut + ... : 253	-> Difference from max: 0
---------------
Total: 509
Ratio: 101.186%

biasa          : 253 	-> Difference from max: 0
biasa + ...    : 0
---------------
Total

In [72]:
balanced_dfs = []

for emotion in emotion_cols:
    single_emotion_df = full_df[(full_df[emotion] == 1) & (full_df['num_emotions'] == 1)]
    print(f"Single {emotion} DF:", len(single_emotion_df))
    single_emotion_df_balanced = single_emotion_df.sample(n=min_single_emotion_total, random_state=seed)
    print(f"Single {emotion} DF (after balancing):", len(single_emotion_df_balanced))

    comb_emotion_df = full_df[(full_df[emotion] == 1) & (full_df['num_emotions'] > 1)]
    print(f"Combination {emotion} DF:", len(comb_emotion_df))
    comb_emotion_df_balanced = comb_emotion_df.sample(n=min_single_emotion_total, random_state=seed) if len(comb_emotion_df) > min_single_emotion_total else comb_emotion_df
    print(f"Combination {emotion} DF (after balancing):", len(comb_emotion_df_balanced))

    total = len(single_emotion_df_balanced) + len(comb_emotion_df_balanced)

    print("-" * 46)
    print("Total:", total)

    balanced_dfs += [single_emotion_df_balanced, comb_emotion_df_balanced]

    print()

assert len(balanced_dfs) == len(emotion_cols) * 2

Single marah DF: 246
Single marah DF (after balancing): 221
Combination marah DF: 213
Combination marah DF (after balancing): 213
----------------------------------------------
Total: 434

Single jijik DF: 221
Single jijik DF (after balancing): 221
Combination jijik DF: 197
Combination jijik DF (after balancing): 197
----------------------------------------------
Total: 418

Single takut DF: 248
Single takut DF (after balancing): 221
Combination takut DF: 130
Combination takut DF (after balancing): 130
----------------------------------------------
Total: 351

Single senang DF: 434
Single senang DF (after balancing): 221
Combination senang DF: 243
Combination senang DF (after balancing): 221
----------------------------------------------
Total: 442

Single sedih DF: 266
Single sedih DF (after balancing): 221
Combination sedih DF: 216
Combination sedih DF (after balancing): 216
----------------------------------------------
Total: 437

Single terkejut DF: 256
Single terkejut DF (after b

In [82]:
balanced_df = pd.concat(balanced_dfs)
balanced_df = balanced_df[~balanced_df.index.duplicated(keep='first')]

print("DF size:", len(full_df))
print("DF size (after balancing)", len(balanced_df))

DF size: 2495
DF size (after balancing) 1551


In [83]:
save_df = balanced_df.drop('num_emotions', axis=1)
save_df.to_csv('sun_go_emotions_balanced.csv', index=False)

In [444]:
# save_df = full_df.drop('num_emotions', axis=1)
# save_df.to_csv('sun_go_emotions.csv', index=False)