# Libraries

In [1]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from huggingface_hub import HfApi

# Config

In [8]:
seed = 42
# lang = 'eng'
lang = 'sun_go_emotions'
# raw_data_dir = './data/public_data/'
raw_data_dir = './data/augmented_data/'
preprocessed_data_dir = './data/preprocessed_data/'

# Fractions used for the train / validation / test splits, in that order.
split_sizes = [0.7, 0.15, 0.15]
# Compare with a tolerance: an exact `== 1.0` check can reject valid fractions
# whose binary floating-point sum is off by one ulp.
assert abs(sum(split_sizes) - 1.0) < 1e-9, f"split_sizes must sum to 1.0, got {sum(split_sizes)}"


def split_tag(sizes):
    """Render split fractions as underscore-joined whole percentages.

    Percentages are rounded rather than truncated, because products such as
    0.29 * 100 evaluate to 28.999999999999996 and `int()` would yield 28.

    Args:
        sizes: iterable of fractions, e.g. [0.7, 0.15, 0.15].

    Returns:
        A string such as '70_15_15'.
    """
    return '_'.join(str(round(size * 100)) for size in sizes)


hf_data_id = 'alxxtexxr/SemEval2025-Task11-Dataset'
# Config name = language + split percentages + a fixed suffix.
hf_data_config = lang + '_' + split_tag(split_sizes) + '_stratify_v2'
print("Hugging Face dataset config:", hf_data_config)

Hugging Face dataset config: sun_go_emotions_70_15_15_stratify_v2


In [9]:
def set_seed(seed):
    """Seed the global random number generators and report the seed used.

    Seeds NumPy's legacy global RNG and Python's `random` module, then stores
    the seed in the PYTHONHASHSEED environment variable.

    NOTE: PYTHONHASHSEED is read at interpreter start-up, so writing it here
    affects processes launched afterwards, not the hashing of the kernel that
    is already running.

    Args:
        seed: integer seed applied to every generator.
    """
    # NumPy first, then the standard library generator.
    for seed_fn in (np.random.seed, random.seed):
        seed_fn(seed)

    # Environment variables must be strings.
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Random seed set to: {seed}")

# Seed the global RNGs once, up front, with the seed configured for this notebook.
set_seed(seed)

Random seed set to: 42


# Data

## Preprocess Data

In [10]:
# Read the raw Track A training CSV for the configured language.
train_csv_path = os.path.join(raw_data_dir, f'train/track_a/{lang}.csv')
train_df = pd.read_csv(train_csv_path)

# Report the row count followed by a blank line, then show the frame as the cell output.
print("Training DF length:", len(train_df), end="\n\n")
train_df

Training DF length: 2495



Unnamed: 0,text,emotion,marah,jijik,takut,senang,sedih,terkejut,biasa
0,Aya randa ker nguseup Pantun sunda meuni reuseup,senang,0,0,0,1,0,0,0
1,pastina ath mang ku abdi shere ken knu grup + SW,biasa,0,0,0,0,0,0,1
2,Mang Fiksi téh urang Majalengka ogé?,senang,0,0,0,1,0,0,0
3,"mang dana ,uing can ngopi, kumaha ieu?",senang,0,0,0,1,0,0,0
4,Sedih Mang Ai kudu D Caritakeun Mah😄🙏,"senang, sedih",0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...
2490,"Nya, anjeun geus méakkeun dua jam.",biasa,0,0,0,0,0,0,1
2491,Kuring teu maca artikel anjeun tapi kuring nya...,biasa,0,0,0,0,0,0,1
2492,Aya sababaraha variasi sapanjang taun.,biasa,0,0,0,0,0,0,1
2493,Kuring mendakan sahanteuna hiji putri duyung u...,biasa,0,0,0,0,0,0,1


In [12]:
# Per-language mapping from the canonical label column names to the names
# used for that language's emotion columns.
emotion_col_map = {
    'eng': {
        'Anger': 'anger', 'Fear': 'fear', 'Joy': 'joy',
        'Sadness': 'sad', 'Surprise': 'surprise',
    },
    'deu': {
        'Anger': 'wut', 'Disgust': 'ekel', 'Fear': 'angst',
        'Joy': 'freude', 'Sadness': 'trauer', 'Surprise': 'überraschung',
    },
    'sun': {
        'Anger': 'marah', 'Disgust': 'jijik', 'Fear': 'takut',
        'Joy': 'senang', 'Sadness': 'sedih', 'Surprise': 'terkejut',
    },
    'sun_go_emotions': {
        'Anger': 'marah', 'Disgust': 'jijik', 'Fear': 'takut',
        'Joy': 'senang', 'Sadness': 'sedih', 'Surprise': 'terkejut',
    },
}
emotion_cols = [*emotion_col_map[lang].values()]

# Label given to rows in which none of the emotion columns is set.
neutral_emotion_map = {
    'eng': 'neutral',
    'deu': 'neutral',
    'sun': 'biasa',
    'sun_go_emotions': 'biasa',
}
neutral_emotion = neutral_emotion_map[lang]

# Bring the label columns to the names listed above; columns that are not
# present under a canonical name are left as they are.
train_df = train_df.rename(columns=emotion_col_map[lang])


def _positive_emotions(row):
    """Comma-join the names of the emotion columns whose value is 1 in `row`."""
    return ', '.join([col for col in emotion_cols if row[col] == 1])


# Rebuild the 'emotion' column from the binary columns; rows without any
# positive emotion produce an empty string, which becomes the neutral label.
emotion_labels = train_df.apply(_positive_emotions, axis=1)
train_df['emotion'] = emotion_labels
train_df['emotion'] = train_df['emotion'].replace('', neutral_emotion)

# Binary indicator column for the neutral label, then track it with the others.
train_df[neutral_emotion] = train_df['emotion'].eq(neutral_emotion).astype(int)
emotion_cols.append(neutral_emotion)
print("Emotion columns:", emotion_cols)
train_df.head()

Emotion columns: ['marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']


Unnamed: 0,text,emotion,marah,jijik,takut,senang,sedih,terkejut,biasa
0,Aya randa ker nguseup Pantun sunda meuni reuseup,senang,0,0,0,1,0,0,0
1,pastina ath mang ku abdi shere ken knu grup + SW,biasa,0,0,0,0,0,0,1
2,Mang Fiksi téh urang Majalengka ogé?,senang,0,0,0,1,0,0,0
3,"mang dana ,uing can ngopi, kumaha ieu?",senang,0,0,0,1,0,0,0
4,Sedih Mang Ai kudu D Caritakeun Mah😄🙏,"senang, sedih",0,0,0,1,1,0,0


In [13]:
# Stratified split the data
def create_stratify_col(df):
    """Add a 'stratify' column to `df` (in place) for use in a stratified split.

    The column starts as a copy of 'emotion'. Every row whose emotion value
    occurs exactly once is relabelled 'dummy', so that these one-off classes
    are pooled into a single stratum.

    If there is exactly one such row, 'dummy' would itself be a one-member
    class, which a stratified `train_test_split` rejects. In that case the row
    is assigned to the most frequent stratum instead. When the frame holds no
    other class (a single-row frame), the 'dummy' label is kept.

    Args:
        df: DataFrame with an 'emotion' column. Modified in place.

    Returns:
        None.
    """
    # Count once and reuse for both the singleton lookup and the fallback.
    emotion_counts = df['emotion'].value_counts()
    singleton_emotions = emotion_counts[emotion_counts == 1].index

    df['stratify'] = df['emotion']
    is_singleton = df['emotion'].isin(singleton_emotions)
    df.loc[is_singleton, 'stratify'] = 'dummy'

    # A lone singleton cannot form a valid stratum on its own.
    if is_singleton.sum() == 1:
        repeated_counts = emotion_counts[emotion_counts > 1]
        if not repeated_counts.empty:
            df.loc[is_singleton, 'stratify'] = repeated_counts.idxmax()

# Emotion combinations with fewer rows than this are kept out of the stratified
# split and appended to the training split afterwards.
# NOTE(review): 7 presumably leaves enough rows per class for both stratified
# splits below — confirm before changing.
min_class_count = 7

# Fail early with a clear message; the code below only knows how to produce a
# train / validation / test split.
if len(split_sizes) != 3:
    raise ValueError(
        f"split_sizes must hold exactly three fractions (train, val, test), got {split_sizes}"
    )

emotion_counts = train_df['emotion'].value_counts()
emotion_lt_7 = emotion_counts[emotion_counts < min_class_count].index.tolist()
emotion_lt_7_cond = train_df['emotion'].isin(emotion_lt_7)
train_df_emotion_lt_7 = train_df[emotion_lt_7_cond]

# `train_df` is deliberately NOT rebound to the filtered frame: doing so makes
# this cell non-idempotent, because a second run would find no rare rows and
# silently drop them from the final training split.
# The explicit copy ensures the helper 'stratify' column is written to an
# independent frame.
train_df_common = train_df[~emotion_lt_7_cond].copy()
create_stratify_col(train_df_common)

split_cols = ['text', 'emotion'] + emotion_cols

# First split: training rows vs. the combined validation + test pool.
train_df_, val_test_df = train_test_split(train_df_common[split_cols],
                                          train_size=split_sizes[0],
                                          stratify=train_df_common['stratify'],
                                          random_state=seed)

# Second split: divide the pool into validation and test. The stratify column
# is rebuilt because class counts inside the pool differ from the full data.
val_test_df = val_test_df.copy()
create_stratify_col(val_test_df)
# Test share expressed relative to the pool rather than to the whole dataset.
test_size = split_sizes[-1] / (split_sizes[1] + split_sizes[-1])
val_df, test_df = train_test_split(val_test_df[split_cols],
                                   test_size=test_size,
                                   stratify=val_test_df['stratify'],
                                   random_state=seed)

# The training length printed here is the stratified part only; the rare rows
# are appended below.
print("Training DF length (splitted):", len(train_df_), "-->", train_df_.columns.tolist())
print("Validation DF length:", len(val_df), "-->", val_df.columns.tolist())
print("Testing DF length:", len(test_df), "-->", test_df.columns.tolist())

# Rare emotion combinations go to the training split only.
train_df_ = pd.concat([train_df_, train_df_emotion_lt_7[split_cols]])

# dev_df = pd.read_csv(os.path.join(raw_data_dir, f'dev/track_a/{lang}_a.csv'))
# dev_df = dev_df.rename(columns=emotion_col_map[lang])
# dev_df['emotion'] = None
# dev_df = dev_df[['text', 'emotion'] + emotion_cols]
# print("Dev. DF length:", len(dev_df), "-->", dev_df.columns.tolist())

Training DF length (splitted): 1702 --> ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']
Validation DF length: 365 --> ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']
Testing DF length: 365 --> ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']


In [14]:
# Show how many test rows carry each emotion combination, as a check on the stratified split.
test_df['emotion'].value_counts()

emotion
senang                     65
sedih                      40
terkejut                   38
biasa                      38
takut                      37
marah                      37
jijik                      33
senang, terkejut           20
marah, jijik               14
senang, sedih               7
takut, sedih                6
marah, sedih                4
jijik, takut                3
jijik, sedih                3
sedih, terkejut             3
jijik, terkejut             2
takut, terkejut             2
senang, sedih, terkejut     2
marah, terkejut             2
jijik, senang               2
marah, takut                2
marah, jijik, terkejut      2
marah, sedih, terkejut      1
takut, senang, terkejut     1
takut, senang               1
Name: count, dtype: int64

## Save Preprocessed Data

In [15]:
# Output folder: <preprocessed_data_dir>/track_a/<hf_data_config>
save_dir = os.path.join(preprocessed_data_dir, 'track_a', hf_data_config)

# Create the folder from Python rather than via `!mkdir -p $save_dir`: this
# works on any platform, is safe for paths containing spaces, and keeps the
# cell valid Python outside IPython.
os.makedirs(save_dir, exist_ok=True)

# One CSV per split; the DataFrame index is not written.
train_df_.to_csv(os.path.join(save_dir, 'train.csv'), index=False)
val_df.to_csv(os.path.join(save_dir, 'val.csv'), index=False)
test_df.to_csv(os.path.join(save_dir, 'test.csv'), index=False)
# dev_df.to_csv(os.path.join(save_dir, 'dev.csv'), index=False)

print("Saved to:", save_dir)

Saved to: ./data/preprocessed_data/track_a/sun_go_emotions_70_15_15_stratify_v2


## Upload Preprocessed Data to Hugging Face

In [16]:
# Upload the saved split folder to the Hugging Face dataset repository.
hf_api = HfApi()
hf_api.upload_folder(
    repo_id=hf_data_id,
    repo_type='dataset',
    folder_path=save_dir,
    # Repository paths always use forward slashes, so build this one explicitly
    # instead of with os.path.join, which would insert a backslash on Windows.
    path_in_repo=f'preprocessed_data/track_a/{hf_data_config}',
)

CommitInfo(commit_url='https://huggingface.co/datasets/alxxtexxr/SemEval2025-Task11-Dataset/commit/960dfef1e5e565e1b4debdfb7fc4c550996cef54', commit_message='Upload folder using huggingface_hub', commit_description='', oid='960dfef1e5e565e1b4debdfb7fc4c550996cef54', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/alxxtexxr/SemEval2025-Task11-Dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='alxxtexxr/SemEval2025-Task11-Dataset'), pr_revision=None, pr_num=None)