# Libraries

In [70]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from huggingface_hub import HfApi

# Config

In [71]:
seed = 42
# lang = 'sun'
lang = 'sun_go_emotions'

# raw_data_dir = './data/public_data/'
raw_data_dir = './data/augmented_data/'
raw_data_path = os.path.join(raw_data_dir, f'train/track_a/sun_go_emotions_v2.csv')
preprocessed_data_dir = './data/preprocessed_data/'

split_sizes = [0.7, 0.15, 0.15]
assert sum(split_sizes) == 1.0
hf_data_id = 'alxxtexxr/SemEval2025-Task11-Dataset'
hf_data_config = lang + '_' + '_'.join([str(int(split_size * 100)) for split_size in split_sizes]) + '_v2'
print("Hugging Face dataset config:", hf_data_config)

Hugging Face dataset config: sun_go_emotions_70_15_15_v2


In [72]:
def set_seed(seed):
    # Set random seed for NumPy
    np.random.seed(seed)

    # Optionally set random seed for sklearn and Python's own random module
    random.seed(seed)

    # Set random seed for os
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Random seed set to: {seed}")

set_seed(seed)

Random seed set to: 42


# Data

## Preprocess Data

In [73]:
train_df = pd.read_csv(raw_data_path)
print("Training DF length:", len(train_df))
print()
train_df

Training DF length: 4066



Unnamed: 0,text,emotion,marah,jijik,takut,senang,sedih,terkejut,biasa,aug_go_emotions
0,Ngobrol ka kolotna meureun lamun eta pilihan.,biasa,0,0,0,0,0,0,1,0
1,Urang mah sok bingung ka batur anu ngomong nye...,"senang, sedih",0,0,0,1,1,0,0,0
2,"""Bedana teh lantaran abdi pantes narimana."" Ie...",marah,1,0,0,0,0,0,0,0
3,Aranjeun teh barudak teuweul anu ambek sabab m...,"marah, jijik",1,1,0,0,0,0,0,0
4,Anjeun teu nyaho naon fakta teh?,biasa,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
4061,"Nya, anjeun geus meakkeun dua jam.",biasa,0,0,0,0,0,0,1,0
4062,Kuring teu maca artikel anjeun tapi kuring nya...,biasa,0,0,0,0,0,0,1,0
4063,Aya sababaraha variasi sapanjang taun.,biasa,0,0,0,0,0,0,1,0
4064,Kuring mendakan sahanteuna hiji putri duyung u...,biasa,0,0,0,0,0,0,1,0


In [74]:
emotion_col_map = {
    'eng': { 'Anger': 'anger', 'Fear': 'fear', 'Joy': 'joy', 'Sadness': 'sad', 'Surprise': 'surprise' },
    'deu': { 'Anger': 'wut', 'Disgust': 'ekel', 'Fear': 'angst', 'Joy': 'freude', 'Sadness': 'trauer', 'Surprise': 'überraschung' },
    'sun': { 'Anger': 'marah', 'Disgust': 'jijik', 'Fear': 'takut', 'Joy': 'senang', 'Sadness': 'sedih', 'Surprise': 'terkejut' },
    'sun_go_emotions': { 'Anger': 'marah', 'Disgust': 'jijik', 'Fear': 'takut', 'Joy': 'senang', 'Sadness': 'sedih', 'Surprise': 'terkejut' },
}
emotion_cols = list(emotion_col_map[lang].values())

neutral_emotion_map = {
    'eng': 'neutral',
    'deu': 'neutral',
    'sun': 'biasa',
    'sun_go_emotions': 'biasa',
}
neutral_emotion = neutral_emotion_map[lang]

# Rename emotion columns
train_df = train_df.rename(columns=emotion_col_map[lang])

# Create 'emotion' column by combining the positive emotions
train_df['emotion'] = train_df.apply(lambda row: ', '.join([col for col in emotion_cols if row[col] == 1]), axis=1)
train_df['emotion'] = train_df['emotion'].replace('', neutral_emotion) # Fill neutral emotion

# Create neutral emotion column
train_df[neutral_emotion_map[lang]] = (train_df['emotion'] == neutral_emotion).astype(int)
emotion_cols += [neutral_emotion]
print("Emotion columns:", emotion_cols)

# Get augmentation columns
aug_columns = [col for col in train_df.columns if col.startswith('aug_')]
print("Augmentation columns:", aug_columns)

train_df.head()

Emotion columns: ['marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']
Augmentation columns: ['aug_go_emotions']


Unnamed: 0,text,emotion,marah,jijik,takut,senang,sedih,terkejut,biasa,aug_go_emotions
0,Ngobrol ka kolotna meureun lamun eta pilihan.,biasa,0,0,0,0,0,0,1,0
1,Urang mah sok bingung ka batur anu ngomong nye...,"senang, sedih",0,0,0,1,1,0,0,0
2,"""Bedana teh lantaran abdi pantes narimana."" Ie...",marah,1,0,0,0,0,0,0,0
3,Aranjeun teh barudak teuweul anu ambek sabab m...,"marah, jijik",1,1,0,0,0,0,0,0
4,Anjeun teu nyaho naon fakta teh?,biasa,0,0,0,0,0,0,1,0


In [78]:
# Stratified split the data
def create_stratify_col(df):
    # Create 'stratify' column for stratified split
    df['stratify'] = df['emotion']

    # Identify classes with only one member
    single_class = df['emotion'].value_counts()[df['emotion'].value_counts() == 1].index

    # Assign a dummy value for the 'stratify' column for these classes
    df.loc[df['emotion'].isin(single_class), 'stratify'] = 'dummy'

create_stratify_col(train_df)

emotion_lt_7 = train_df['emotion'].value_counts()[(train_df['emotion'].value_counts() < 7)].index.tolist()
emotion_lt_7_cond = train_df['emotion'].isin(emotion_lt_7)
train_df_emotion_lt_7 = train_df[emotion_lt_7_cond]
train_df = train_df[~emotion_lt_7_cond]

# Divide DF into non-augmented and augmented DF
train_df_non_aug = train_df[(train_df[aug_columns] == 0).all(axis=1)]
train_df_aug = train_df[(train_df[aug_columns] == 1).all(axis=1)]

# Create testing DF from non-augmented DF
test_size = split_sizes[-1] * (len(train_df_emotion_lt_7) + len(train_df))
train_df_non_aug_, test_df = train_test_split(train_df_non_aug[['text', 'emotion'] + emotion_cols],
                                              test_size=test_size,
                                              stratify=train_df_non_aug['stratify'],
                                              random_state=seed)

train_df_ = pd.concat(train_df_non_aug_, train_df_aug)

create_stratify_col(train_df_)

# Split training DF into training and validation DFs
train_df__, val_df = train_test_split(train_df_[['text', 'emotion'] + emotion_cols],
                                      test_size=test_size,
                                      stratify=train_df_['stratify'],
                                      random_state=seed)

# create_stratify_col(val_test_df)
# test_size = split_sizes[-1]/(split_sizes[1] + split_sizes[-1])
# val_df, test_df = train_test_split(val_test_df[['text', 'emotion'] + emotion_cols],
#                                     test_size=test_size,
#                                     stratify=val_test_df['stratify'],
#                                     random_state=seed)

print("Training DF length (splitted):", len(train_df__), "-->", train_df__.columns.tolist())
print("Validation DF length:", len(val_df), "-->", val_df.columns.tolist())
print("Testing DF length:", len(test_df), "-->", test_df.columns.tolist())

train_df__ = pd.concat([train_df__, train_df_emotion_lt_7[['text', 'emotion'] + emotion_cols]])

# dev_df = pd.read_csv(os.path.join(raw_data_dir, f'dev/track_a/{lang}_a.csv'))
# dev_df = dev_df.rename(columns=emotion_col_map[lang])
# dev_df['emotion'] = None
# dev_df = dev_df[['text', 'emotion'] + emotion_cols]
# print("Dev. DF length:", len(dev_df), "-->", dev_df.columns.tolist())

600


SyntaxError: 'return' outside function (3295533728.py, line 28)

In [23]:
test_df['emotion'].value_counts()

emotion
marah                      32
jijik                      27
takut                      25
senang                     22
sedih                      21
biasa                      19
terkejut                   19
marah, jijik               13
senang, terkejut           12
takut, sedih                5
senang, sedih               5
marah, sedih                5
jijik, takut                3
marah, terkejut             2
marah, takut                2
takut, terkejut             2
jijik, sedih                2
sedih, terkejut             1
jijik, terkejut             1
marah, jijik, terkejut      1
marah, sedih, terkejut      1
senang, sedih, terkejut     1
jijik, senang               1
Name: count, dtype: int64

## Save Preprocessed Data

In [24]:
save_dir = os.path.join(preprocessed_data_dir, 'track_a', hf_data_config)

!mkdir -p $save_dir

train_df_.to_csv(os.path.join(save_dir, 'train.csv'), index=False)
val_df.to_csv(os.path.join(save_dir, 'val.csv'), index=False)
test_df.to_csv(os.path.join(save_dir, 'test.csv'), index=False)
# dev_df.to_csv(os.path.join(save_dir, 'dev.csv'), index=False)

print("Saved to:", save_dir)

Saved to: ./data/preprocessed_data/track_a/sun_go_emotions_70_15_15_balanced


## Upload Preprocessed Data to Hugging Face

In [25]:
hf_api = HfApi()
hf_api.upload_folder(
    repo_id=hf_data_id,
    repo_type='dataset',
    folder_path=save_dir,
    path_in_repo=os.path.join('preprocessed_data/track_a', hf_data_config),
)

CommitInfo(commit_url='https://huggingface.co/datasets/alxxtexxr/SemEval2025-Task11-Dataset/commit/9dd9c464be247d43d5843313f0596bcca9c4d62d', commit_message='Upload folder using huggingface_hub', commit_description='', oid='9dd9c464be247d43d5843313f0596bcca9c4d62d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/alxxtexxr/SemEval2025-Task11-Dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='alxxtexxr/SemEval2025-Task11-Dataset'), pr_revision=None, pr_num=None)