# Libraries

In [427]:
import pandas as pd
from datasets import load_dataset

# Config

In [428]:
hf_data_id = 'alxxtexxr/SemEval2025-Task11-Dataset'
hf_data_config = 'track_a_sun_70_15_15_stratify_v2'

add_data_paths = [
    'data/go_emotions_sun/fear_single_242_comb_79_translated_cleaned_curated_merged_final.csv',
    'data/go_emotions_sun/anger_single_237_comb_73_translated_cleaned_curated_merged_final.csv',
    'data/go_emotions_sun/disgust_single_210_comb_35_translated_cleaned_curated_merged_final.csv',
    'data/go_emotions_sun/sadness_single_183_comb_20_translated_cleaned_curated_merged_final.csv',
    'data/go_emotions_sun/surprise_single_240_comb_22_translated_cleaned_curated_merged_final.csv',
    'data/go_emotions_sun/neutral_single_210_comb_0_translated_cleaned_curated_merged_final.csv',
]

# Data

## Load Data

In [429]:
datasets = load_dataset(hf_data_id, hf_data_config)

cols = list(datasets['train'].features)
emotion_cols = [col for col in cols if col not in ['Unnamed: 0', 'text', 'emotion']]
splits = [*datasets.keys()]

print("Splits:", splits)
print("Data columns:", cols)
print("Emotions columns:", emotion_cols)

Splits: ['train', 'val', 'test']
Data columns: ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']
Emotions columns: ['marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']


In [430]:
df = {split: pd.DataFrame(datasets[split]) for split in splits}
full_df_ = pd.concat(df.values())
print("Full DF size:", len(full_df_))
full_df_.head()

Full DF size: 924


Unnamed: 0,text,emotion,marah,jijik,takut,senang,sedih,terkejut,biasa
0,Aya randa ker nguseup Pantun sunda meuni reuseup,senang,0,0,0,1,0,0,0
1,pastina ath mang ku abdi shere ken knu grup + SW,biasa,0,0,0,0,0,0,1
2,Mang Fiksi téh urang Majalengka ogé?,senang,0,0,0,1,0,0,0
3,"mang dana ,uing can ngopi, kumaha ieu?",senang,0,0,0,1,0,0,0
4,Sedih Mang Ai kudu D Caritakeun Mah😄🙏,"senang, sedih",0,0,0,1,1,0,0


In [431]:
if len(add_data_paths):
    add_df = pd.concat([pd.read_csv(add_data_path) for add_data_path in add_data_paths])
    add_df.drop(['id', 'curation_status', 'text'], axis=1, inplace=True)
    add_df.rename(columns={'text_translated': 'text'}, inplace=True)
    add_df['emotion'] = add_df.apply(lambda row: ', '.join([col for col in emotion_cols if row[col] == 1]), axis=1)
    print("Additional DF size:", len(add_df))
    
    full_df = pd.concat([full_df_, add_df])
    print("Full DF size (with additional data):", len(full_df))
else:
    full_df = full_df_

Additional DF size: 1571
Full DF size (with additional data): 2495


## EDA

In [432]:
full_df['num_emotions'] = full_df.apply(lambda row: int(sum(row[emotion_cols].tolist())), axis=1)

single_emotion_totals = [len(full_df[(full_df[emotion_col] == 1) & (full_df['num_emotions'] == 1)]) for emotion_col in emotion_cols]
comb_emotion_totals = [len(full_df[(full_df[emotion_col] == 1) & (full_df['num_emotions'] > 1)]) for emotion_col in emotion_cols]

print("Distribution of data with single emotion and combined emotions:\n")
for emotion, single_emotion_total, comb_emotion_total in zip(emotion_cols, single_emotion_totals, comb_emotion_totals):
    diff_from_max_single = max(comb_emotion_totals) - single_emotion_total
    diff_from_max_comb = max(comb_emotion_totals) - comb_emotion_total
    ratio = single_emotion_total / (comb_emotion_total if comb_emotion_total else 1) * 100

    print(f"{emotion:<15}:", single_emotion_total, "\t-> Difference from max:", diff_from_max_single, end="")
    print()

    print(f"{emotion + ' + ...':<15}:", comb_emotion_total, end="")
    if emotion != "biasa":
        print("\t-> Difference from max:", diff_from_max_comb, end="")
    print()

    if emotion != "biasa":
        print("-"*15)
        print(f"Ratio: {ratio:.3f}%")
    print()

Distribution of data with single emotion and combined emotions:

marah          : 246 	-> Difference from max: 7
marah + ...    : 213	-> Difference from max: 40
---------------
Ratio: 115.493%

jijik          : 221 	-> Difference from max: 32
jijik + ...    : 197	-> Difference from max: 56
---------------
Ratio: 112.183%

takut          : 248 	-> Difference from max: 5
takut + ...    : 130	-> Difference from max: 123
---------------
Ratio: 190.769%

senang         : 434 	-> Difference from max: -181
senang + ...   : 243	-> Difference from max: 10
---------------
Ratio: 178.601%

sedih          : 266 	-> Difference from max: -13
sedih + ...    : 216	-> Difference from max: 37
---------------
Ratio: 123.148%

terkejut       : 256 	-> Difference from max: -3
terkejut + ... : 253	-> Difference from max: 0
---------------
Ratio: 101.186%

biasa          : 253 	-> Difference from max: 0
biasa + ...    : 0



In [435]:
print("Distribution of emotions:")
full_df[emotion_cols].sum().sort_values(ascending=False)

Distribution of emotions:


senang      677
terkejut    509
sedih       482
marah       459
jijik       418
takut       378
biasa       253
dtype: int64

In [436]:
print("Emotion combinations distribution:")
full_df['emotion'].value_counts()

Emotion combinations distribution:


emotion
senang                                  434
sedih                                   266
terkejut                                256
biasa                                   253
takut                                   248
marah                                   246
jijik                                   221
senang, terkejut                        132
marah, jijik                             94
senang, sedih                            45
takut, sedih                             44
marah, sedih                             31
sedih, terkejut                          20
jijik, sedih                             20
jijik, takut                             20
takut, terkejut                          14
marah, takut                             14
marah, terkejut                          13
senang, sedih, terkejut                  11
jijik, terkejut                          10
jijik, senang                            10
marah, sedih, terkejut                    8
marah, jijik, terkejut  

In [444]:
# save_df = full_df.drop('num_emotions', axis=1)
# save_df.to_csv('sun_go_emotions.csv', index=False)