# Libraries

In [1]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from huggingface_hub import HfApi

# Config

In [2]:
# Global configuration for the preprocessing run.
seed = 42
# lang = 'eng'
lang = 'sun'
raw_data_dir = './data/public_data/'
preprocessed_data_dir = './data/preprocessed_data/'

# Fractions for the train / validation / test splits, in that order
split_sizes = [0.7, 0.15, 0.15]
# Compare with a tolerance: an exact float `==` rejects valid fractions such as
# [0.7, 0.2, 0.1], whose sum evaluates to 0.9999999999999999.
assert abs(sum(split_sizes) - 1.0) < 1e-9, f"split_sizes must sum to 1.0, got {sum(split_sizes)}"

hf_data_id = 'alxxtexxr/SemEval2025-Task11-Dataset'
# round() instead of int(): int() truncates, so e.g. 0.29 * 100 (28.999999999999996) would become 28
split_tag = '_'.join(str(round(split_size * 100)) for split_size in split_sizes)
hf_data_config = f'{lang}_{split_tag}_stratify_v2'
print("Hugging Face dataset config:", hf_data_config)

Hugging Face dataset config: sun_70_15_15_stratify_v2


In [3]:
def set_seed(seed):
    """Seed the global random number generators used by this notebook.

    Args:
        seed: Integer seed applied to Python's `random` module and to NumPy's
            legacy global RNG, and exported as `PYTHONHASHSEED`.

    Prints a confirmation message; returns nothing.
    """
    # NumPy's legacy global RNG
    np.random.seed(seed)

    # Python's built-in `random` module (this does NOT seed scikit-learn;
    # pass `random_state=` to scikit-learn calls for that)
    random.seed(seed)

    # PYTHONHASHSEED is read once at interpreter start-up, so setting it here
    # does not change hashing in the running kernel; it is only inherited by
    # processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Random seed set to: {seed}")

# Seed the RNGs once, before any data is loaded or split
set_seed(seed)

Random seed set to: 42


# Data

## Preprocess Data

In [8]:
# Load the raw Track A training CSV for the configured language
train_csv_path = os.path.join(raw_data_dir, f'train/track_a/{lang}.csv')
train_df = pd.read_csv(train_csv_path)

# Report the row count, followed by a blank line before the table preview
print("Training DF length:", len(train_df), end="\n\n")
train_df

Training DF length: 924



Unnamed: 0,id,text,Anger,Disgust,Fear,Joy,Sadness,Surprise
0,sun_train_track_a_00001,"mang dana ,uing can ngopi, kumaha ieu?",0,0,0,1,0,0
1,sun_train_track_a_00002,"Siap teu nanaon nuhun pisan tos nonton,kumaha ...",0,0,0,1,0,0
2,sun_train_track_a_00003,"ulin ka tasik mang, urang hayang asup kana kon...",0,0,0,1,0,0
3,sun_train_track_a_00004,Pokona bakalan sukses akang² ieu mah 👍👍👍,0,0,0,1,0,0
4,sun_train_track_a_00005,FIKSI mang dana kunn tara nyieun vidio deui ma...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
919,sun_train_track_a_00920,bener mang teu karasa nya waktu duh kmh sehat?,0,0,0,0,1,1
920,sun_train_track_a_00921,Terang ieu chenel ka akang willi ti stt wa rer...,0,0,0,1,0,0
921,sun_train_track_a_00922,Sabaraha gereget jadi orang sunda Kamari urang...,0,0,0,1,0,0
922,sun_train_track_a_00923,"Yg nunggu vidio baru ngacung, ☝ Naha jarang up...",0,0,0,0,1,1


In [9]:
# Original label column -> language-specific label name
emotion_col_map = {
    'eng': {
        'Anger': 'anger', 'Fear': 'fear', 'Joy': 'joy',
        'Sadness': 'sad', 'Surprise': 'surprise',
    },
    'deu': {
        'Anger': 'wut', 'Disgust': 'ekel', 'Fear': 'angst',
        'Joy': 'freude', 'Sadness': 'trauer', 'Surprise': 'überraschung',
    },
    'sun': {
        'Anger': 'marah', 'Disgust': 'jijik', 'Fear': 'takut',
        'Joy': 'senang', 'Sadness': 'sedih', 'Surprise': 'terkejut',
    },
}

# Label given to rows in which no emotion column is set
neutral_emotion_map = {
    'eng': 'neutral',
    'deu': 'neutral',
    'sun': 'biasa',
}

neutral_emotion = neutral_emotion_map[lang]
emotion_cols = list(emotion_col_map[lang].values())

# Switch the label columns to their language-specific names
train_df = train_df.rename(columns=emotion_col_map[lang])

# Build the 'emotion' column: comma-separated names of every label equal to 1,
# or the neutral label when none is set
train_df['emotion'] = (
    train_df[emotion_cols]
    .eq(1)
    .apply(lambda flags: ', '.join(col for col, is_on in flags.items() if is_on), axis=1)
    .replace('', neutral_emotion)
)

# Add a 0/1 indicator column for the neutral label and track it as a label column
train_df[neutral_emotion] = (train_df['emotion'] == neutral_emotion).astype(int)
emotion_cols.append(neutral_emotion)
print("Emotion columns:", emotion_cols)
train_df.head()

Emotion columns: ['marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']


Unnamed: 0,id,text,marah,jijik,takut,senang,sedih,terkejut,emotion,biasa
0,sun_train_track_a_00001,"mang dana ,uing can ngopi, kumaha ieu?",0,0,0,1,0,0,senang,0
1,sun_train_track_a_00002,"Siap teu nanaon nuhun pisan tos nonton,kumaha ...",0,0,0,1,0,0,senang,0
2,sun_train_track_a_00003,"ulin ka tasik mang, urang hayang asup kana kon...",0,0,0,1,0,0,senang,0
3,sun_train_track_a_00004,Pokona bakalan sukses akang² ieu mah 👍👍👍,0,0,0,1,0,0,senang,0
4,sun_train_track_a_00005,FIKSI mang dana kunn tara nyieun vidio deui ma...,0,0,0,0,0,0,biasa,1


In [10]:
# Stratified split the data
def create_stratify_col(df):
    """Add a 'stratify' column to `df` that is safe to pass as `stratify=`.

    The column starts as a copy of `df['emotion']`. Every emotion value that
    occurs exactly once is replaced by the shared label 'dummy', because a
    stratified split needs at least two members per class.

    If there is exactly one such value, 'dummy' would itself have a single
    member; in that case the smallest remaining class is folded into 'dummy'
    as well so the group has at least two rows.

    Args:
        df: DataFrame with an 'emotion' column. It is modified in place; pass
            a copy when `df` is a slice of another frame.
    """
    class_counts = df['emotion'].value_counts()
    singleton_classes = class_counts[class_counts == 1].index

    # Start from the emotion label, then pool all one-member classes
    df['stratify'] = df['emotion']
    df.loc[df['emotion'].isin(singleton_classes), 'stratify'] = 'dummy'

    # A lone singleton leaves 'dummy' with one row: merge the smallest other class into it
    if len(singleton_classes) == 1:
        larger_classes = class_counts[class_counts > 1]
        if not larger_classes.empty:
            df.loc[df['emotion'] == larger_classes.idxmin(), 'stratify'] = 'dummy'

# Emotion combinations with fewer rows than this are kept out of the stratified
# split and added to the training set only.
# NOTE(review): 7 is presumably the smallest count that still gives >= 1 row in
# a 15% split (7 * 0.15 ~= 1.05) -- confirm before changing `split_sizes`.
min_class_size = 7

# Fail early with a clear message: the cells below need train, val and test DFs
if len(split_sizes) != 3:
    raise ValueError(
        f"`split_sizes` must hold exactly 3 fractions (train, val, test), got {len(split_sizes)}"
    )

create_stratify_col(train_df)

# Separate the rare emotion combinations from the ones that get stratified.
# `train_df` is not overwritten, so re-running this cell gives the same result.
emotion_counts = train_df['emotion'].value_counts()
emotion_lt_7 = emotion_counts[emotion_counts < min_class_size].index.tolist()
emotion_lt_7_cond = train_df['emotion'].isin(emotion_lt_7)
train_df_emotion_lt_7 = train_df[emotion_lt_7_cond]
train_df_common = train_df[~emotion_lt_7_cond]

# Columns kept in the output splits
split_cols = ['text', 'emotion'] + emotion_cols

# First split: training vs. (validation + testing)
train_df_, val_test_df = train_test_split(train_df_common[split_cols],
                                          train_size=split_sizes[0],
                                          stratify=train_df_common['stratify'],
                                          random_state=seed)

# Second split: validation vs. testing. Work on a copy so adding the
# 'stratify' column does not write into a slice of another frame.
val_test_df = val_test_df.copy()
create_stratify_col(val_test_df)
test_size = split_sizes[-1] / (split_sizes[1] + split_sizes[-1])
val_df, test_df = train_test_split(val_test_df[split_cols],
                                   test_size=test_size,
                                   stratify=val_test_df['stratify'],
                                   random_state=seed)

print("Training DF length (splitted):", len(train_df_), "-->", train_df_.columns.tolist())
print("Validation DF length:", len(val_df), "-->", val_df.columns.tolist())
print("Testing DF length:", len(test_df), "-->", test_df.columns.tolist())

# The rare emotion combinations go to the training set only
train_df_ = pd.concat([train_df_, train_df_emotion_lt_7[split_cols]])

# Every input row must end up in exactly one split
assert len(train_df_) + len(val_df) + len(test_df) == len(train_df), \
    "Rows were lost or duplicated while splitting"

# dev_df = pd.read_csv(os.path.join(raw_data_dir, f'dev/track_a/{lang}_a.csv'))
# dev_df = dev_df.rename(columns=emotion_col_map[lang])
# dev_df['emotion'] = None
# dev_df = dev_df[['text', 'emotion'] + emotion_cols]
# print("Dev. DF length:", len(dev_df), "-->", dev_df.columns.tolist())

Training DF length (splitted): 595 --> ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']
Validation DF length: 128 --> ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']
Testing DF length: 128 --> ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']


In [11]:
# Frequency of each emotion combination in the test split
test_emotion_counts = test_df['emotion'].value_counts()
test_emotion_counts

emotion
senang                     66
senang, terkejut           19
sedih                      13
senang, sedih               7
biasa                       6
terkejut                    3
marah, jijik                2
sedih, terkejut             2
marah, sedih                2
jijik, senang               2
senang, sedih, terkejut     1
marah, jijik, terkejut      1
marah                       1
takut, sedih                1
takut, senang               1
marah, sedih, terkejut      1
Name: count, dtype: int64

## Save Preprocessed Data

In [12]:
# Save preprocessed data
save_dir = os.path.join(preprocessed_data_dir, 'track_a', hf_data_config)

# Create the output directory (and any missing parents) without shelling out,
# so the cell also works where `mkdir -p` is unavailable
os.makedirs(save_dir, exist_ok=True)

# One CSV per split; the DataFrame index is not written
train_df_.to_csv(os.path.join(save_dir, 'train.csv'), index=False)
val_df.to_csv(os.path.join(save_dir, 'val.csv'), index=False)
test_df.to_csv(os.path.join(save_dir, 'test.csv'), index=False)
# dev_df.to_csv(os.path.join(save_dir, 'dev.csv'), index=False)

print("Saved to:", save_dir)

Saved to: ./data/preprocessed_data/track_a/sun_70_15_15_stratify_v2


## Upload Preprocessed Data to Hugging Face

In [13]:
# Upload the saved split folder to the Hugging Face dataset repository
hf_api = HfApi()
hf_api.upload_folder(
    repo_id=hf_data_id,
    repo_type='dataset',
    folder_path=save_dir,
    # Build the repo path with '/' explicitly: os.path.join would insert
    # backslashes on Windows
    path_in_repo=f'preprocessed_data/track_a/{hf_data_config}',
)

CommitInfo(commit_url='https://huggingface.co/datasets/alxxtexxr/SemEval2025-Task11-Dataset/commit/7921baad1d35851e6b8dcfc877d9b838befdb0a0', commit_message='Upload folder using huggingface_hub', commit_description='', oid='7921baad1d35851e6b8dcfc877d9b838befdb0a0', pr_url=None, pr_revision=None, pr_num=None)