# Libraries

In [110]:
import os
import random
import numpy as np
import pandas as pd
import torch
import transformers
import re
import unicodedata
from datasets import load_dataset

# Config

In [111]:
seed = 42

# hf_data_id = 'alxxtexxr/SemEval2025-Task11-Dataset'
# hf_data_config = 'track_a_sun_70_15_15_stratify_v2'
raw_data_path = 'data/preprocessed_data_raw/track_a/sun/train.csv'

aug_type = 'go_emotions'
aug_data_paths = [
    'data/go_emotions_sun/fear_single_242_comb_79_translated_cleaned_curated_merged_final.csv',
    'data/go_emotions_sun/anger_single_237_comb_73_translated_cleaned_curated_merged_final.csv',
    'data/go_emotions_sun/disgust_single_210_comb_35_translated_cleaned_curated_merged_final.csv',
    'data/go_emotions_sun/sadness_single_183_comb_20_translated_cleaned_curated_merged_final.csv',
    'data/go_emotions_sun/surprise_single_240_comb_22_translated_cleaned_curated_merged_final.csv',
    'data/go_emotions_sun/neutral_single_210_comb_0_translated_cleaned_curated_merged_final.csv',
]

eng2idn_emotion_map = {
    'anger': 'marah', 
    'disgust': 'jijik', 
    'fear': 'takut', 
    'joy': 'senang', 
    'sadness': 'sedih', 
    'surprise': 'terkejut', 
    'neutral': 'biasa',
}

In [112]:
def set_seed(seed):
    # Set random seed for NumPy
    np.random.seed(seed)

    # Set random seed for Torch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if using multi-GPU
    torch.backends.cudnn.deterministic = True  # Ensures deterministic results
    torch.backends.cudnn.benchmark = False  # Avoids non-deterministic algorithms

    # Set random seed for Transformers
    transformers.set_seed(seed)

    # Optionally set random seed for sklearn and Python's own random module
    random.seed(seed)

    # Set random seed for os
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Random seed set to: {seed}")

set_seed(seed)

Random seed set to: 42


# Data

## Load Data

In [113]:
full_df_ = pd.read_csv(raw_data_path)
full_df_ = full_df_.rename(columns=eng2idn_emotion_map)

full_df_.head()

Unnamed: 0,id,text,marah,jijik,takut,senang,sedih,terkejut
0,sun_train_track_a_00001,Kumaha mang fiksi engke ka sabilulungan mang s...,0,0,0,1,0,1
1,sun_train_track_a_00002,tapi domba anakan namah lain ku kuring mreun. ...,0,0,0,1,0,1
2,sun_train_track_a_00003,Aduh mang naha bet kudu penting di upload ma😂😂,0,0,0,1,0,0
3,sun_train_track_a_00004,"pokonamah nuhun sabandungeun , kita terus ting...",0,0,0,1,1,0
4,sun_train_track_a_00005,Kang eta teu isin?? Apa emng tukang ngisinkeun...,0,0,0,1,0,1


In [114]:
# datasets = load_dataset(hf_data_id, hf_data_config)

# cols = list(datasets['train'].features)
cols = list(full_df_.columns)
emotion_cols = [col for col in cols if col not in ['Unnamed: 0', 'text', 'emotion', 'id']]
# splits = [*datasets.keys()]

# print("Splits:", splits)
print("Data columns:", cols)
print("Emotions columns:", emotion_cols)

Data columns: ['id', 'text', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut']
Emotions columns: ['marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut']


In [115]:
# df = {split: pd.DataFrame(datasets[split]) for split in splits}
# full_df_ = pd.concat(df.values())
# print("Full DF size:", len(full_df_))
# full_df_.head()

In [116]:
if 'aug_data_paths' in globals() or 'aug_type' in globals():
    aug_df = pd.concat([pd.read_csv(aug_data_path) for aug_data_path in globals()['aug_data_paths']])
    aug_df.drop(['id', 'curation_status', 'text', 'num_emotions'], axis=1, inplace=True)
    aug_df.rename(columns={'text_translated': 'text'}, inplace=True)
    aug_df['emotion'] = aug_df.apply(lambda row: ', '.join([col for col in emotion_cols if row[col] == 1]), axis=1)
    aug_df[f'aug_{globals()['aug_type']}'] = 1
    print("Augmentation DF size:", len(aug_df))
    
    full_df_[f'aug_{globals()['aug_type']}'] = 0
    full_df = pd.concat([full_df_, aug_df])
    print("Full DF size (after augmentation):", len(full_df))
else:
    full_df = full_df_

full_df.head()

Augmentation DF size: 1571
Full DF size (after augmentation): 2494


Unnamed: 0,id,text,marah,jijik,takut,senang,sedih,terkejut,aug_go_emotions,biasa,emotion
0,sun_train_track_a_00001,Kumaha mang fiksi engke ka sabilulungan mang s...,0,0,0,1,0,1,0,,
1,sun_train_track_a_00002,tapi domba anakan namah lain ku kuring mreun. ...,0,0,0,1,0,1,0,,
2,sun_train_track_a_00003,Aduh mang naha bet kudu penting di upload ma😂😂,0,0,0,1,0,0,0,,
3,sun_train_track_a_00004,"pokonamah nuhun sabandungeun , kita terus ting...",0,0,0,1,1,0,0,,
4,sun_train_track_a_00005,Kang eta teu isin?? Apa emng tukang ngisinkeun...,0,0,0,1,0,1,0,,


## EDA

In [117]:
def remove_emojis_and_symbols(text):
    # Regex pattern to match emojis and symbols with variation selectors
    emoji_pattern = re.compile(
        "[\U00010000-\U0010ffff\U00002000-\U00002BFF\U00002702-\U000027B0]+", 
        flags=re.UNICODE)
    # Remove emojis
    text = emoji_pattern.sub('', text)
    # Remove any non-ASCII symbols (except regular punctuation)
    text = ''.join(char for char in text if ord(char) < 128 and unicodedata.category(char) != 'Mn')
    return text

def contains_non_ascii(text):
    # Remove emojis and symbols first
    text = remove_emojis_and_symbols(text)
    try:
        # Check if the remaining text contains non-ASCII characters
        text.encode('ascii')
    except UnicodeEncodeError:
        return True
    return False

print("Total data with non-ASCII chars:", int(full_df['text'].apply(contains_non_ascii).sum()))

Total data with non-ASCII chars: 0


In [118]:
def normalize_to_ascii(text):
    # Normalize the text to decompose accented characters
    text = unicodedata.normalize('NFKD', text)
    # Remove only combining marks (like accents) while keeping emojis and other symbols
    text = ''.join(char for char in text if not unicodedata.combining(char))
    return text

# Normalize to ASCII equivalents
full_df['text'] = full_df['text'].apply(normalize_to_ascii)
print("Total data with non-ASCII chars (after normalizing them):", int(full_df['text'].apply(contains_non_ascii).sum()))

full_df.head()

Total data with non-ASCII chars (after normalizing them): 0


Unnamed: 0,id,text,marah,jijik,takut,senang,sedih,terkejut,aug_go_emotions,biasa,emotion
0,sun_train_track_a_00001,Kumaha mang fiksi engke ka sabilulungan mang s...,0,0,0,1,0,1,0,,
1,sun_train_track_a_00002,tapi domba anakan namah lain ku kuring mreun. ...,0,0,0,1,0,1,0,,
2,sun_train_track_a_00003,Aduh mang naha bet kudu penting di upload ma😂😂,0,0,0,1,0,0,0,,
3,sun_train_track_a_00004,"pokonamah nuhun sabandungeun , kita terus ting...",0,0,0,1,1,0,0,,
4,sun_train_track_a_00005,Kang eta teu isin?? Apa emng tukang ngisinkeun...,0,0,0,1,0,1,0,,


In [119]:
print("Distribution of emotions:")
full_df[emotion_cols].sum().sort_values(ascending=False)

Distribution of emotions:


senang      677
terkejut    509
sedih       482
marah       458
jijik       417
takut       378
dtype: int64

In [120]:
print("Emotion combinations distribution:")
full_df['emotion'].value_counts()

Emotion combinations distribution:


emotion
takut                            243
terkejut                         240
marah                            238
jijik                            220
                                 210
sedih                            184
marah, jijik                      79
takut, sedih                      33
marah, sedih                      22
jijik, takut                      20
jijik, sedih                      15
marah, takut                      13
marah, terkejut                   13
takut, terkejut                   12
jijik, terkejut                    8
sedih, terkejut                    4
senang, sedih                      3
marah, jijik, takut                3
marah, takut, sedih, terkejut      2
marah, takut, sedih                2
marah, jijik, takut, sedih         1
marah, jijik, sedih, terkejut      1
jijik, takut, sedih                1
takut, senang, terkejut            1
senang, terkejut                   1
jijik, sedih, terkejut             1
marah, jijik, sedih           

In [121]:
full_df['num_emotions'] = full_df.apply(lambda row: int(sum(row[emotion_cols].tolist())), axis=1)

single_emotion_totals = [len(full_df[(full_df[emotion_col] == 1) & (full_df['num_emotions'] == 1)]) for emotion_col in emotion_cols]
comb_emotion_totals = [len(full_df[(full_df[emotion_col] == 1) & (full_df['num_emotions'] > 1)]) for emotion_col in emotion_cols]

min_single_emotion_total = float('inf')

print("Distribution of data with single emotion and combined emotions:\n")
for emotion, single_emotion_total, comb_emotion_total in zip(emotion_cols, single_emotion_totals, comb_emotion_totals):
    if single_emotion_total < min_single_emotion_total:
        min_single_emotion_total = single_emotion_total

    diff_from_max_single = max(comb_emotion_totals) - single_emotion_total
    diff_from_max_comb = max(comb_emotion_totals) - comb_emotion_total
    ratio = single_emotion_total / (comb_emotion_total if comb_emotion_total else 1) * 100
    total = single_emotion_total + comb_emotion_total

    print(f"{emotion:<15}:", single_emotion_total, "\t-> Difference from max:", diff_from_max_single, end="")
    print()

    print(f"{emotion + ' + ...':<15}:", comb_emotion_total, end="")
    if emotion != "biasa":
        print("\t-> Difference from max:", diff_from_max_comb, end="")
    print()

    print("-"*15)
    print("Total:", total)

    if emotion != "biasa":
        print(f"Ratio: {ratio:.3f}%")
    print()

print("Min. single emotion total:", min_single_emotion_total)

Distribution of data with single emotion and combined emotions:

marah          : 246 	-> Difference from max: 7
marah + ...    : 212	-> Difference from max: 41
---------------
Total: 458
Ratio: 116.038%

jijik          : 221 	-> Difference from max: 32
jijik + ...    : 196	-> Difference from max: 57
---------------
Total: 417
Ratio: 112.755%

takut          : 248 	-> Difference from max: 5
takut + ...    : 130	-> Difference from max: 123
---------------
Total: 378
Ratio: 190.769%

senang         : 434 	-> Difference from max: -181
senang + ...   : 243	-> Difference from max: 10
---------------
Total: 677
Ratio: 178.601%

sedih          : 266 	-> Difference from max: -13
sedih + ...    : 216	-> Difference from max: 37
---------------
Total: 482
Ratio: 123.148%

terkejut       : 256 	-> Difference from max: -3
terkejut + ... : 253	-> Difference from max: 0
---------------
Total: 509
Ratio: 101.186%

Min. single emotion total: 221


## Balance Data

In [106]:
# balanced_dfs = []

# for emotion in emotion_cols:
#     single_emotion_df = full_df[(full_df[emotion] == 1) & (full_df['num_emotions'] == 1)]
#     print(f"Single {emotion} DF:", len(single_emotion_df))
#     single_emotion_df_balanced = single_emotion_df.sample(n=min_single_emotion_total, random_state=seed)
#     print(f"Single {emotion} DF (after balancing):", len(single_emotion_df_balanced))

#     comb_emotion_df = full_df[(full_df[emotion] == 1) & (full_df['num_emotions'] > 1)]
#     print(f"Combination {emotion} DF:", len(comb_emotion_df))
#     comb_emotion_df_balanced = comb_emotion_df.sample(n=min_single_emotion_total, random_state=seed) if len(comb_emotion_df) > min_single_emotion_total else comb_emotion_df
#     print(f"Combination {emotion} DF (after balancing):", len(comb_emotion_df_balanced))

#     total = len(single_emotion_df_balanced) + len(comb_emotion_df_balanced)

#     print("-" * 46)
#     print("Total:", total)

#     balanced_dfs += [single_emotion_df_balanced, comb_emotion_df_balanced]

#     print()

# assert len(balanced_dfs) == len(emotion_cols) * 2

In [107]:
# balanced_df = pd.concat(balanced_dfs)
# balanced_df = balanced_df[~balanced_df.index.duplicated(keep='first')]

# print("DF size:", len(full_df))
# print("DF size (after balancing)", len(balanced_df))

In [108]:
# save_df = balanced_df.drop('num_emotions', axis=1)
# save_df.to_csv('sun_go_emotions_balanced.csv', index=False)

## Save Data

In [122]:
save_path = 'sun_go_emotions_v3.csv'
save_df = full_df.drop('num_emotions', axis=1)
save_df.to_csv(save_path, index=False)
print("Saved to:", save_path)

Saved to: sun_go_emotions_v3.csv
