# Libraries

In [19]:
import os
import random
import numpy as np
import pandas as pd
import torch
import transformers
import unicodedata
from datasets import load_dataset

# Config

In [20]:
seed = 42

hf_data_id = 'alxxtexxr/SemEval2025-Task11-Dataset'
# hf_data_config = 'track_a_sun_70_15_15_stratify_v2'
hf_data_config = 'track_a_sun_go_emotions_70_15_15_stratify_v2'

add_type = 'go_emotions'
add_data_paths = [
    'data/go_emotions_sun/fear_single_242_comb_79_translated_cleaned_curated_merged_final.csv',
    'data/go_emotions_sun/anger_single_237_comb_73_translated_cleaned_curated_merged_final.csv',
    'data/go_emotions_sun/disgust_single_210_comb_35_translated_cleaned_curated_merged_final.csv',
    'data/go_emotions_sun/sadness_single_183_comb_20_translated_cleaned_curated_merged_final.csv',
    'data/go_emotions_sun/surprise_single_240_comb_22_translated_cleaned_curated_merged_final.csv',
    'data/go_emotions_sun/neutral_single_210_comb_0_translated_cleaned_curated_merged_final.csv',
]

In [21]:
def set_seed(seed):
    # Set random seed for NumPy
    np.random.seed(seed)

    # Set random seed for Torch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if using multi-GPU
    torch.backends.cudnn.deterministic = True  # Ensures deterministic results
    torch.backends.cudnn.benchmark = False  # Avoids non-deterministic algorithms

    # Set random seed for Transformers
    transformers.set_seed(seed)

    # Optionally set random seed for sklearn and Python's own random module
    random.seed(seed)

    # Set random seed for os
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Random seed set to: {seed}")

set_seed(seed)

Random seed set to: 42


# Data

## Load Data

In [22]:
datasets = load_dataset(hf_data_id, hf_data_config)

cols = list(datasets['train'].features)
emotion_cols = [col for col in cols if col not in ['Unnamed: 0', 'text', 'emotion']]
splits = [*datasets.keys()]

print("Splits:", splits)
print("Data columns:", cols)
print("Emotions columns:", emotion_cols)

Splits: ['train', 'val', 'test']
Data columns: ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']
Emotions columns: ['marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']


In [23]:
df = {split: pd.DataFrame(datasets[split]) for split in splits}
full_df_ = pd.concat(df.values())
print("Full DF size:", len(full_df_))
full_df_.head()

Full DF size: 2495


Unnamed: 0,text,emotion,marah,jijik,takut,senang,sedih,terkejut,biasa
0,Ngobrol ka kolotna meureun lamun éta pilihan.,biasa,0,0,0,0,0,0,1
1,Urang mah sok bingung ka batur anu ngomong nye...,"senang, sedih",0,0,0,1,1,0,0
2,"""Bédana téh lantaran abdi pantes narimana."" Ie...",marah,1,0,0,0,0,0,0
3,Aranjeun téh barudak teuweul anu ambek sabab m...,"marah, jijik",1,1,0,0,0,0,0
4,Anjeun teu nyaho naon fakta téh?,biasa,0,0,0,0,0,0,1


In [24]:
if 'add_data_paths' in globals():
    add_df = pd.concat([pd.read_csv(add_data_path) for add_data_path in globals()['add_data_paths']])
    add_df.drop(['id', 'curation_status', 'text', 'num_emotions'], axis=1, inplace=True)
    add_df.rename(columns={'text_translated': 'text'}, inplace=True)
    add_df['emotion'] = add_df.apply(lambda row: ', '.join([col for col in emotion_cols if row[col] == 1]), axis=1)
    add_df[f'aug_{add_type}'] = 1
    print("Additional DF size:", len(add_df))
    
    full_df_[f'aug_{add_type}'] = 0
    full_df = pd.concat([full_df_, add_df])
    print("Full DF size (with additional data):", len(full_df))
else:
    full_df = full_df_

full_df.head()

Additional DF size: 1571
Full DF size (with additional data): 4066


Unnamed: 0,text,emotion,marah,jijik,takut,senang,sedih,terkejut,biasa,aug_go_emotions
0,Ngobrol ka kolotna meureun lamun éta pilihan.,biasa,0,0,0,0,0,0,1,0
1,Urang mah sok bingung ka batur anu ngomong nye...,"senang, sedih",0,0,0,1,1,0,0,0
2,"""Bédana téh lantaran abdi pantes narimana."" Ie...",marah,1,0,0,0,0,0,0,0
3,Aranjeun téh barudak teuweul anu ambek sabab m...,"marah, jijik",1,1,0,0,0,0,0,0
4,Anjeun teu nyaho naon fakta téh?,biasa,0,0,0,0,0,0,1,0


## EDA

In [25]:
def contains_non_ascii(text):
    try:
        text.encode('ascii')
    except UnicodeEncodeError:
        return True
    return False

print("Total data with non-ASCII chars:", int(full_df['text'].apply(contains_non_ascii).sum()))

Total data with non-ASCII chars: 1933


In [26]:
def normalize_to_ascii(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

# Normalize to ASCII equivalents
full_df['text'] = full_df['text'].apply(normalize_to_ascii)
print("Total data with non-ASCII chars (after normalizing them):", int(full_df['text'].apply(contains_non_ascii).sum()))

Total data with non-ASCII chars (after normalizing them): 0


In [27]:
print("Distribution of emotions:")
full_df[emotion_cols].sum().sort_values(ascending=False)

Distribution of emotions:


marah       834
terkejut    792
jijik       768
sedih       752
takut       709
senang      682
biasa       463
dtype: int64

In [28]:
print("Emotion combinations distribution:")
full_df['emotion'].value_counts()

Emotion combinations distribution:


emotion
terkejut                                496
takut                                   491
marah                                   484
biasa                                   463
sedih                                   450
jijik                                   441
senang                                  434
marah, jijik                            173
senang, terkejut                        133
takut, sedih                             77
marah, sedih                             53
senang, sedih                            48
jijik, takut                             40
jijik, sedih                             35
marah, takut                             27
marah, terkejut                          26
takut, terkejut                          26
sedih, terkejut                          24
jijik, terkejut                          18
senang, sedih, terkejut                  11
jijik, senang                            10
marah, jijik, terkejut                    8
marah, sedih, terkejut  

In [29]:
full_df['num_emotions'] = full_df.apply(lambda row: int(sum(row[emotion_cols].tolist())), axis=1)

single_emotion_totals = [len(full_df[(full_df[emotion_col] == 1) & (full_df['num_emotions'] == 1)]) for emotion_col in emotion_cols]
comb_emotion_totals = [len(full_df[(full_df[emotion_col] == 1) & (full_df['num_emotions'] > 1)]) for emotion_col in emotion_cols]

min_single_emotion_total = float('inf')

print("Distribution of data with single emotion and combined emotions:\n")
for emotion, single_emotion_total, comb_emotion_total in zip(emotion_cols, single_emotion_totals, comb_emotion_totals):
    if single_emotion_total < min_single_emotion_total:
        min_single_emotion_total = single_emotion_total

    diff_from_max_single = max(comb_emotion_totals) - single_emotion_total
    diff_from_max_comb = max(comb_emotion_totals) - comb_emotion_total
    ratio = single_emotion_total / (comb_emotion_total if comb_emotion_total else 1) * 100
    total = single_emotion_total + comb_emotion_total

    print(f"{emotion:<15}:", single_emotion_total, "\t-> Difference from max:", diff_from_max_single, end="")
    print()

    print(f"{emotion + ' + ...':<15}:", comb_emotion_total, end="")
    if emotion != "biasa":
        print("\t-> Difference from max:", diff_from_max_comb, end="")
    print()

    print("-"*15)
    print("Total:", total)

    if emotion != "biasa":
        print(f"Ratio: {ratio:.3f}%")
    print()

print("Min. single emotion total:", min_single_emotion_total)

Distribution of data with single emotion and combined emotions:

marah          : 484 	-> Difference from max: -134
marah + ...    : 350	-> Difference from max: 0
---------------
Total: 834
Ratio: 138.286%

jijik          : 441 	-> Difference from max: -91
jijik + ...    : 327	-> Difference from max: 23
---------------
Total: 768
Ratio: 134.862%

takut          : 491 	-> Difference from max: -141
takut + ...    : 218	-> Difference from max: 132
---------------
Total: 709
Ratio: 225.229%

senang         : 434 	-> Difference from max: -84
senang + ...   : 248	-> Difference from max: 102
---------------
Total: 682
Ratio: 175.000%

sedih          : 450 	-> Difference from max: -100
sedih + ...    : 302	-> Difference from max: 48
---------------
Total: 752
Ratio: 149.007%

terkejut       : 496 	-> Difference from max: -146
terkejut + ... : 296	-> Difference from max: 54
---------------
Total: 792
Ratio: 167.568%

biasa          : 463 	-> Difference from max: -113
biasa + ...    : 0
--------

## Balance Data

In [30]:
# balanced_dfs = []

# for emotion in emotion_cols:
#     single_emotion_df = full_df[(full_df[emotion] == 1) & (full_df['num_emotions'] == 1)]
#     print(f"Single {emotion} DF:", len(single_emotion_df))
#     single_emotion_df_balanced = single_emotion_df.sample(n=min_single_emotion_total, random_state=seed)
#     print(f"Single {emotion} DF (after balancing):", len(single_emotion_df_balanced))

#     comb_emotion_df = full_df[(full_df[emotion] == 1) & (full_df['num_emotions'] > 1)]
#     print(f"Combination {emotion} DF:", len(comb_emotion_df))
#     comb_emotion_df_balanced = comb_emotion_df.sample(n=min_single_emotion_total, random_state=seed) if len(comb_emotion_df) > min_single_emotion_total else comb_emotion_df
#     print(f"Combination {emotion} DF (after balancing):", len(comb_emotion_df_balanced))

#     total = len(single_emotion_df_balanced) + len(comb_emotion_df_balanced)

#     print("-" * 46)
#     print("Total:", total)

#     balanced_dfs += [single_emotion_df_balanced, comb_emotion_df_balanced]

#     print()

# assert len(balanced_dfs) == len(emotion_cols) * 2

In [31]:
# balanced_df = pd.concat(balanced_dfs)
# balanced_df = balanced_df[~balanced_df.index.duplicated(keep='first')]

# print("DF size:", len(full_df))
# print("DF size (after balancing)", len(balanced_df))

In [32]:
# save_df = balanced_df.drop('num_emotions', axis=1)
# save_df.to_csv('sun_go_emotions_balanced.csv', index=False)

## Save Data

In [35]:
save_path = 'sun_go_emotions_v2.csv'
save_df = full_df.drop('num_emotions', axis=1)
save_df.to_csv(save_path, index=False)
print("Saved to:", save_path)

Saved to: sun_go_emotions_v2.csv
