# Libraries

In [1]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from huggingface_hub import HfApi

# Config

In [2]:
# Global random seed shared by every seeded step in this notebook.
seed = 42

# Dataset/language key; selects the label-column mapping and names the HF config.
# Alternative: lang = 'sun'
lang = 'sun_go_emotions'

# Location of the raw CSV.
# Alternative: raw_data_dir = './data/public_data/'
raw_data_dir = './data/augmented_data/'
raw_data_path = os.path.join(raw_data_dir, 'train/track_a/sun_go_emotions_v2.csv')
preprocessed_data_dir = './data/preprocessed_data/'

# Train / validation / test fractions.
split_sizes = [0.8, 0.1, 0.1]
# Compare with a tolerance: exact float equality rejects valid splits such as
# [0.7, 0.2, 0.1], whose binary floating-point sum is 0.9999999999999999.
assert abs(sum(split_sizes) - 1.0) < 1e-9, f"split_sizes must sum to 1.0, got {sum(split_sizes)}"

hf_data_id = 'alxxtexxr/SemEval2025-Task11-Dataset'
# round() rather than int(): int() truncates, so e.g. 0.57 * 100 == 56.99999999999999
# would be rendered as 56 in the config name.
hf_data_config = lang + '_' + '_'.join(str(round(split_size * 100)) for split_size in split_sizes)
print("Hugging Face dataset config:", hf_data_config)

Hugging Face dataset config: sun_go_emotions_80_10_10


In [3]:
def set_seed(seed):
    """Seed the global random number generators used by this notebook.

    Seeds NumPy's legacy global RNG and Python's `random` module, stores the
    seed in the PYTHONHASHSEED environment variable, and prints a confirmation.

    Args:
        seed: Value forwarded to `np.random.seed` and `random.seed`.
    """
    # NumPy first, then the standard library generator.
    for seed_rng in (np.random.seed, random.seed):
        seed_rng(seed)

    # NOTE(review): this only writes the environment variable; presumably it
    # affects hashing only in interpreters started afterwards - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Random seed set to: {seed}")

# Seed the global RNGs once, using the `seed` value from the config cell.
set_seed(seed)

Random seed set to: 42


# Data

## Preprocess Data

In [4]:
# Load the raw CSV configured in `raw_data_path` and report its row count,
# followed by a blank line before the table preview.
df = pd.read_csv(raw_data_path)
print("Raw DF length:", len(df), end="\n\n")
df

Raw DF length: 2495



Unnamed: 0,text,emotion,marah,jijik,takut,senang,sedih,terkejut,biasa,aug_go_emotions
0,Aya randa ker nguseup Pantun sunda meuni reuseup,senang,0,0,0,1,0,0,0,0
1,pastina ath mang ku abdi shere ken knu grup + SW,biasa,0,0,0,0,0,0,1,0
2,Mang Fiksi teh urang Majalengka oge?,senang,0,0,0,1,0,0,0,0
3,"mang dana ,uing can ngopi, kumaha ieu?",senang,0,0,0,1,0,0,0,0
4,Sedih Mang Ai kudu D Caritakeun Mah,"senang, sedih",0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
2490,"Nya, anjeun geus meakkeun dua jam.",biasa,0,0,0,0,0,0,1,1
2491,Kuring teu maca artikel anjeun tapi kuring nya...,biasa,0,0,0,0,0,0,1,1
2492,Aya sababaraha variasi sapanjang taun.,biasa,0,0,0,0,0,0,1,1
2493,Kuring mendakan sahanteuna hiji putri duyung u...,biasa,0,0,0,0,0,0,1,1


In [5]:
# Per-dataset mapping from the English label column names to the column names
# used downstream for that dataset.
emotion_col_map = {
    'eng': { 'Anger': 'anger', 'Fear': 'fear', 'Joy': 'joy', 'Sadness': 'sad', 'Surprise': 'surprise' },
    'deu': { 'Anger': 'wut', 'Disgust': 'ekel', 'Fear': 'angst', 'Joy': 'freude', 'Sadness': 'trauer', 'Surprise': 'überraschung' },
    'sun': { 'Anger': 'marah', 'Disgust': 'jijik', 'Fear': 'takut', 'Joy': 'senang', 'Sadness': 'sedih', 'Surprise': 'terkejut' },
    'sun_go_emotions': { 'Anger': 'marah', 'Disgust': 'jijik', 'Fear': 'takut', 'Joy': 'senang', 'Sadness': 'sedih', 'Surprise': 'terkejut' },
}

# Label assigned to rows on which no emotion column is set.
neutral_emotion_map = {
    'eng': 'neutral',
    'deu': 'neutral',
    'sun': 'biasa',
    'sun_go_emotions': 'biasa',
}

lang_col_map = emotion_col_map[lang]
emotion_cols = list(lang_col_map.values())
neutral_emotion = neutral_emotion_map[lang]

# Bring the label columns to the names listed in `emotion_cols`.
df = df.rename(columns=lang_col_map)

# Build the 'emotion' text label: the comma-separated names of all columns
# equal to 1, or the neutral label when none is set.
df['emotion'] = df.apply(
    lambda row: ', '.join([name for name in emotion_cols if row[name] == 1]),
    axis=1,
).replace('', neutral_emotion)

# Binary indicator column for the neutral label, then track it as a label column too.
df[neutral_emotion] = df['emotion'].eq(neutral_emotion).astype(int)
emotion_cols.append(neutral_emotion)
print("Emotion columns:", emotion_cols)

# Columns flagging augmented rows are recognised by their 'aug_' prefix.
aug_columns = [name for name in df.columns if name.startswith('aug_')]
print("Augmentation columns:", aug_columns)

df.head()

Emotion columns: ['marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']
Augmentation columns: ['aug_go_emotions']


Unnamed: 0,text,emotion,marah,jijik,takut,senang,sedih,terkejut,biasa,aug_go_emotions
0,Aya randa ker nguseup Pantun sunda meuni reuseup,senang,0,0,0,1,0,0,0,0
1,pastina ath mang ku abdi shere ken knu grup + SW,biasa,0,0,0,0,0,0,1,0
2,Mang Fiksi teh urang Majalengka oge?,senang,0,0,0,1,0,0,0,0
3,"mang dana ,uing can ngopi, kumaha ieu?",senang,0,0,0,1,0,0,0,0
4,Sedih Mang Ai kudu D Caritakeun Mah,"senang, sedih",0,0,0,1,1,0,0,0


In [6]:
# Partition the rows by the 'aug_go_emotions' flag: 0 -> non-augmented, 1 -> augmented.
aug_flag = df['aug_go_emotions']
df_non_aug = df[aug_flag == 0]
df_aug = df[aug_flag == 1]

n_non_aug, n_aug = len(df_non_aug), len(df_aug)
print("DF non-augmented size:", n_non_aug)
print("DF augmented size:", n_aug)
print("DF total size:", n_aug + n_non_aug)

DF non-augmented size: 924
DF augmented size: 1571
DF total size: 2495


In [8]:
# Stratified split the data
def create_stratify_col(df):
    """Add, in place, the 'stratify' column used as the key for stratified splits.

    The column copies 'emotion', except that every label occurring at most
    twice in `df` is replaced by the shared placeholder 'dummy'.

    Args:
        df: DataFrame with an 'emotion' column. It is modified directly.

    Returns:
        None.
    """
    # Labels that appear one or two times in this frame.
    label_counts = df['emotion'].value_counts()
    rare_labels = label_counts[label_counts <= 2].index

    # Start from the plain label, then pool the rare labels under one key.
    df.loc[:, 'stratify'] = df['emotion']
    df.loc[df['emotion'].isin(rare_labels), 'stratify'] = 'dummy'

def stratify(df, min_class_size=7):
    """Split `df` into training, validation and test frames, stratified by label.

    Reads the module-level `split_sizes` (train/val/test fractions) and `seed`.
    Rows whose 'emotion' label occurs fewer than `min_class_size` times are
    kept out of the stratified split and appended to the training frame only.

    Args:
        df: DataFrame with an 'emotion' column. It is not modified; the helper
            'stratify' column is added to an internal copy.
        min_class_size: Labels with fewer rows than this go entirely to training.

    Returns:
        Tuple `(train_df, val_df, test_df)`; each frame carries the 'stratify'
        helper column.

    Raises:
        ValueError: If `split_sizes` does not hold exactly three fractions.
    """
    # Fail with a clear message; otherwise the split frames below would never
    # be bound and the function would die with an UnboundLocalError.
    if len(split_sizes) != 3:
        raise ValueError(
            f"split_sizes must contain exactly 3 fractions (train, val, test), got {len(split_sizes)}"
        )

    # Work on a copy so the helper column is not written into the caller's
    # (possibly sliced) frame.
    df = df.copy()
    create_stratify_col(df)

    # Set aside labels that are too rare to spread over three splits.
    emotion_counts = df['emotion'].value_counts()
    rare_emotions = emotion_counts[emotion_counts < min_class_size].index.tolist()
    is_rare = df['emotion'].isin(rare_emotions)
    df_rare = df[is_rare]
    df = df[~is_rare]

    # First cut: training vs. the combined validation + test pool.
    train_df, val_test_df = train_test_split(
        df,
        train_size=split_sizes[0],
        stratify=df['stratify'],
        random_state=seed,
    )

    # Label counts shrink in the pool, so rebuild the stratification key there.
    val_test_df = val_test_df.copy()
    create_stratify_col(val_test_df)

    # Second cut: the test share relative to the pool.
    test_size = split_sizes[-1] / (split_sizes[1] + split_sizes[-1])
    val_df, test_df = train_test_split(
        val_test_df,
        test_size=test_size,
        stratify=val_test_df['stratify'],
        random_state=seed,
    )

    # Rare-label rows are used for training only.
    train_df = pd.concat([train_df, df_rare])

    return train_df, val_df, test_df

split_names = ("Training", "Validation", "Testing")

# Split the non-augmented rows and report each part.
non_aug_splits = stratify(df_non_aug)
train_df_non_aug, val_df_non_aug, test_df_non_aug = non_aug_splits
for split_name, split_part in zip(split_names, non_aug_splits):
    print(f"{split_name} DF non-augmented size:", len(split_part), "-->", split_part.columns.tolist())
print()

# Split the augmented rows separately, so both groups are divided with the same fractions.
aug_splits = stratify(df_aug)
train_df_aug, val_df_aug, test_df_aug = aug_splits
for split_name, split_part in zip(split_names, aug_splits):
    print(f"{split_name} DF augmented size:", len(split_part), "-->", split_part.columns.tolist())
print()

# Share of non-augmented vs. augmented rows within each split.
for split_name, non_aug_part, aug_part in zip(split_names, non_aug_splits, aug_splits):
    non_aug_count, aug_count = len(non_aug_part), len(aug_part)
    part_total = non_aug_count + aug_count
    print(f"{split_name} non-augmented : augmented ratio: {(non_aug_count/part_total*100):.2f}% : {(aug_count/part_total*100):.2f}%")
print()

# Merge both groups per split, keeping only the text and label columns.
output_cols = ['text', 'emotion'] + emotion_cols
train_df = pd.concat([train_df_non_aug, train_df_aug], axis=0)[output_cols]
val_df = pd.concat([val_df_non_aug, val_df_aug], axis=0)[output_cols]
test_df = pd.concat([test_df_non_aug, test_df_aug], axis=0)[output_cols]
for split_name, combined_part in zip(split_names, (train_df, val_df, test_df)):
    print(f"{split_name} DF size:", len(combined_part), "-->", combined_part.columns.tolist())

Training DF non-augmented size: 753 --> ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa', 'aug_go_emotions', 'stratify']
Validation DF non-augmented size: 85 --> ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa', 'aug_go_emotions', 'stratify']
Testing DF non-augmented size: 86 --> ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa', 'aug_go_emotions', 'stratify']

Training DF augmented size: 1261 --> ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa', 'aug_go_emotions', 'stratify']
Validation DF augmented size: 155 --> ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa', 'aug_go_emotions', 'stratify']
Testing DF augmented size: 155 --> ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa', 'aug_go_emotions', 'stratify']

Training non-augmented : augmented ratio: 37.39% : 62.61%
Val

In [9]:
# Inspect how often each 'emotion' label combination occurs in the combined test split.
test_df['emotion'].value_counts()

emotion
senang                     44
sedih                      28
biasa                      26
terkejut                   25
marah                      25
takut                      24
jijik                      22
senang, terkejut           13
marah, jijik                9
senang, sedih               4
takut, sedih                4
marah, sedih                3
senang, sedih, terkejut     2
marah, jijik, terkejut      2
marah, takut                2
jijik, takut                2
marah, terkejut             2
sedih, terkejut             1
marah, sedih, terkejut      1
takut, terkejut             1
jijik, sedih                1
Name: count, dtype: int64

## Save Preprocessed Data

In [10]:
# Output directory: <preprocessed_data_dir>/track_a/<hf_data_config>
save_dir = os.path.join(preprocessed_data_dir, 'track_a', hf_data_config)

# Create the directory from Python instead of a `!mkdir -p` shell call, so the
# cell does not depend on a POSIX shell or on shell interpolation of the path.
os.makedirs(save_dir, exist_ok=True)

# One CSV per split, without the DataFrame index.
train_df.to_csv(os.path.join(save_dir, 'train.csv'), index=False)
val_df.to_csv(os.path.join(save_dir, 'val.csv'), index=False)
test_df.to_csv(os.path.join(save_dir, 'test.csv'), index=False)
# NOTE(review): dev export is disabled; no `dev_df` is defined in the visible cells.
# dev_df.to_csv(os.path.join(save_dir, 'dev.csv'), index=False)

print("Saved to:", save_dir)

Saved to: ./data/preprocessed_data/track_a/sun_go_emotions_80_10_10


## Upload Preprocessed Data to Hugging Face

In [11]:
# Upload the saved split CSVs to the dataset repository on the Hugging Face Hub.
hf_api = HfApi()
hf_api.upload_folder(
    repo_id=hf_data_id,
    repo_type='dataset',
    folder_path=save_dir,
    # Build the repo path with forward slashes explicitly: os.path.join would
    # insert backslashes on Windows, which is not a valid repo path separator.
    path_in_repo=f'preprocessed_data/track_a/{hf_data_config}',
)

CommitInfo(commit_url='https://huggingface.co/datasets/alxxtexxr/SemEval2025-Task11-Dataset/commit/cfc0870a57a716a8d6052c6fd4fbee555c5ac30d', commit_message='Upload folder using huggingface_hub', commit_description='', oid='cfc0870a57a716a8d6052c6fd4fbee555c5ac30d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/alxxtexxr/SemEval2025-Task11-Dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='alxxtexxr/SemEval2025-Task11-Dataset'), pr_revision=None, pr_num=None)