# Libraries

In [103]:
import os
import random
import numpy as np
import pandas as pd
import re
import unicodedata
from sklearn.model_selection import train_test_split
from huggingface_hub import HfApi

# Config

In [104]:
# Global random seed shared by every seeded operation in this notebook.
seed = 42
# lang = 'sun'
lang = 'sun_go_emotions'

# Input CSV and root directory for the generated splits.
raw_data_path = 'data/augmented_data/train/track_a/sun_go_emotions_back_translated.csv'
preprocessed_data_dir = './data/preprocessed_data/'

# Train / validation / test fractions.
split_sizes = [0.7, 0.15, 0.15]
# Compare against 1.0 with a tolerance: an exact `==` on a float sum can reject
# fractions that are valid but do not add up to exactly 1.0 in binary floating point.
assert len(split_sizes) == 3 and abs(sum(split_sizes) - 1.0) < 1e-9

hf_data_id = 'alxxtexxr/SemEval2025-Task11-Dataset'
# Use round() rather than int() for the percentages: int() truncates, so a
# fraction such as 0.29 (0.29 * 100 == 28.999999999999996) would be named "28".
hf_data_config = lang + '_' + '_'.join([str(round(split_size * 100)) for split_size in split_sizes]) + '_back_translated'
print("Hugging Face dataset config:", hf_data_config)

Hugging Face dataset config: sun_go_emotions_70_15_15_back_translated


In [105]:
def set_seed(seed):
    """Seed the random number generators used by this notebook.

    Seeds NumPy's legacy global RNG and Python's `random` module, stores the
    seed in the `PYTHONHASHSEED` environment variable, and prints a
    confirmation message.

    NOTE(review): Python reads `PYTHONHASHSEED` at interpreter start-up, so
    assigning it here presumably only influences child processes, not the
    hashing behaviour of the running kernel -- confirm if that matters.

    Args:
        seed: Integer seed value.
    """
    # Same order as before: NumPy's global RNG first, then the stdlib RNG.
    for seed_rng in (np.random.seed, random.seed):
        seed_rng(seed)

    # Environment values must be strings.
    hash_seed = str(seed)
    os.environ['PYTHONHASHSEED'] = hash_seed

    print(f"Random seed set to: {seed}")

# Apply the notebook-level `seed` to the random number generators.
set_seed(seed)

Random seed set to: 42


# Data

## Preprocess Data

In [106]:
# Read the raw CSV from the configured path into the working DataFrame.
df = pd.read_csv(raw_data_path)

# Report the row count, followed by a blank line before the table preview.
print("Raw DF length:", len(df), end="\n\n")
df

Raw DF length: 2494



Unnamed: 0,id,text,marah,jijik,takut,senang,sedih,terkejut,aug_go_emotions,biasa,emotion
0,sun_train_track_a_00001,"Kumaha engke carita fiksi ka komunitas, salam ...",0,0,0,1,0,1,0,,
1,sun_train_track_a_00002,Tapi anak domba eta sigana lain aing nu boga. ...,0,0,0,1,0,1,0,,
2,sun_train_track_a_00003,Aduh kang kunaon kudu penting diunggah mah,0,0,0,1,0,0,0,,
3,sun_train_track_a_00004,"Pokokna hapunten ageung, urang terus ningkatke...",0,0,0,1,1,0,0,,
4,sun_train_track_a_00005,Naha manehna teu era?? Na memang tukang nyieun...,0,0,0,1,0,1,0,,
...,...,...,...,...,...,...,...,...,...,...,...
2489,,"Nya, anjeun geus meakkeun dua jam.",0,0,0,0,0,0,1,1.0,
2490,,Kuring teu maca artikel anjeun tapi kuring nya...,0,0,0,0,0,0,1,1.0,
2491,,Aya sababaraha variasi sapanjang taun.,0,0,0,0,0,0,1,1.0,
2492,,Kuring mendakan sahanteuna hiji putri duyung u...,0,0,0,0,0,0,1,1.0,


In [107]:
# For each language key: source label name -> emotion column name used in this notebook.
emotion_col_map = {
    'eng': {
        'Anger': 'anger',
        'Fear': 'fear',
        'Joy': 'joy',
        'Sadness': 'sad',
        'Surprise': 'surprise',
    },
    'deu': {
        'Anger': 'wut',
        'Disgust': 'ekel',
        'Fear': 'angst',
        'Joy': 'freude',
        'Sadness': 'trauer',
        'Surprise': 'überraschung',
    },
    'sun': {
        'Anger': 'marah',
        'Disgust': 'jijik',
        'Fear': 'takut',
        'Joy': 'senang',
        'Sadness': 'sedih',
        'Surprise': 'terkejut',
    },
    'sun_go_emotions': {
        'Anger': 'marah',
        'Disgust': 'jijik',
        'Fear': 'takut',
        'Joy': 'senang',
        'Sadness': 'sedih',
        'Surprise': 'terkejut',
    },
}
emotion_cols = [col_name for col_name in emotion_col_map[lang].values()]

# Label assigned to rows in which none of the emotion columns equals 1.
neutral_emotion_map = {
    'eng': 'neutral',
    'deu': 'neutral',
    'sun': 'biasa',
    'sun_go_emotions': 'biasa',
}
neutral_emotion = neutral_emotion_map[lang]

# Rename any source label columns to the names selected for `lang`.
df = df.rename(columns=emotion_col_map[lang])


def _join_positive_emotions(row):
    """Return the names of the emotion columns equal to 1 in `row`, joined by ', '."""
    return ', '.join(col for col in emotion_cols if row[col] == 1)


# Build the combined 'emotion' label; rows with no positive emotion receive the neutral label.
df['emotion'] = df.apply(_join_positive_emotions, axis=1)
df['emotion'] = df['emotion'].replace('', neutral_emotion)

# Binary indicator column for the neutral label, then track it alongside the other emotion columns.
df[neutral_emotion] = df['emotion'].eq(neutral_emotion).astype(int)
emotion_cols.append(neutral_emotion)
print("Emotion columns:", emotion_cols)

# Columns whose name starts with 'aug_' flag augmented rows.
aug_columns = [name for name in df.columns if name.startswith('aug_')]
print("Augmentation columns:", aug_columns)

df.head()

Emotion columns: ['marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']
Augmentation columns: ['aug_go_emotions']


Unnamed: 0,id,text,marah,jijik,takut,senang,sedih,terkejut,aug_go_emotions,biasa,emotion
0,sun_train_track_a_00001,"Kumaha engke carita fiksi ka komunitas, salam ...",0,0,0,1,0,1,0,0,"senang, terkejut"
1,sun_train_track_a_00002,Tapi anak domba eta sigana lain aing nu boga. ...,0,0,0,1,0,1,0,0,"senang, terkejut"
2,sun_train_track_a_00003,Aduh kang kunaon kudu penting diunggah mah,0,0,0,1,0,0,0,0,senang
3,sun_train_track_a_00004,"Pokokna hapunten ageung, urang terus ningkatke...",0,0,0,1,1,0,0,0,"senang, sedih"
4,sun_train_track_a_00005,Naha manehna teu era?? Na memang tukang nyieun...,0,0,0,1,0,1,0,0,"senang, terkejut"


In [108]:
def contains_non_ascii(text):
    """Tell whether `text` has at least one character that cannot be encoded as ASCII.

    Args:
        text: String to inspect.

    Returns:
        True if encoding `text` as ASCII raises UnicodeEncodeError, otherwise False.
    """
    found_non_ascii = False
    try:
        text.encode('ascii')
    except UnicodeEncodeError:
        # Encoding failed, so some character lies outside the ASCII range.
        found_non_ascii = True
    return found_non_ascii

# def remove_emojis_and_symbols(text):
#     # Regex pattern to match emojis and symbols with variation selectors
#     emoji_pattern = re.compile(
#         "[\U00010000-\U0010ffff\U00002000-\U00002BFF\U00002702-\U000027B0]+", 
#         flags=re.UNICODE)
#     # Remove emojis
#     text = emoji_pattern.sub('', text)
#     # Remove any non-ASCII symbols (except regular punctuation)
#     text = ''.join(char for char in text if ord(char) < 128 and unicodedata.category(char) != 'Mn')
#     return text

# def contains_non_ascii(text):
#     # Remove emojis and symbols first
#     text = remove_emojis_and_symbols(text)
#     try:
#         # Check if the remaining text contains non-ASCII characters
#         text.encode('ascii')
#     except UnicodeEncodeError:
#         return True
#     return False

# Count the rows whose 'text' value contains at least one non-ASCII character.
print("Total data with non-ASCII chars:", int(df['text'].apply(contains_non_ascii).sum()))

Total data with non-ASCII chars: 0


In [109]:
# Stratified split the data
def create_stratify_col(df):
    """Add a 'stratify' column to `df` in place.

    The column copies 'emotion', except that every emotion value occurring
    exactly once in `df` is replaced by the shared placeholder 'dummy'.

    NOTE(review): if exactly one emotion value is a singleton, the 'dummy'
    group itself has a single member -- verify the downstream stratified
    split tolerates that.

    Args:
        df: DataFrame with an 'emotion' column; modified in place.

    Returns:
        None.
    """
    # Count each label once and keep those that appear a single time.
    label_counts = df['emotion'].value_counts()
    singleton_labels = label_counts[label_counts == 1].index

    # Start from the raw label, then collapse the singletons into one bucket.
    df['stratify'] = df['emotion']
    is_singleton = df['emotion'].isin(singleton_labels)
    df.loc[is_singleton, 'stratify'] = 'dummy'

# Build the stratification key on the full frame before partitioning it.
create_stratify_col(df)

# Label combinations with fewer than 7 rows are held out of the stratified
# split; they are appended to the training set further down.
emotion_counts = df['emotion'].value_counts()
emotion_lt_7 = emotion_counts[emotion_counts < 7].index.tolist()
emotion_lt_7_cond = df['emotion'].isin(emotion_lt_7)
df_emotion_lt_7 = df[emotion_lt_7_cond]
df_emotion_gte_7 = df[~emotion_lt_7_cond]

# Columns kept in every output split.
split_cols = ['text', 'emotion'] + emotion_cols

if len(split_sizes) == 3:
    # First cut: training set versus the combined validation + test pool.
    train_df, val_test_df = train_test_split(
        df_emotion_gte_7[split_cols],
        train_size=split_sizes[0],
        stratify=df_emotion_gte_7['stratify'],
        random_state=seed,
    )

    # Recompute the stratification key on the pool, since label counts changed.
    create_stratify_col(val_test_df)

    # Second cut: test share expressed relative to the validation + test pool.
    test_size = split_sizes[-1]/(split_sizes[1] + split_sizes[-1])
    val_df, test_df = train_test_split(
        val_test_df[split_cols],
        test_size=test_size,
        stratify=val_test_df['stratify'],
        random_state=seed,
    )

# Rows with rare label combinations go to the training set only.
train_df = pd.concat([train_df, df_emotion_lt_7[split_cols]])

print("Training DF size:", len(train_df), "-->", train_df.columns.tolist())
print("Validation DF size:", len(val_df), "-->", val_df.columns.tolist())
print("Testing DF size:", len(test_df), "-->", test_df.columns.tolist())

# dev_df = pd.read_csv(os.path.join(raw_data_dir, f'dev/track_a/{lang}_a.csv'))
# dev_df = dev_df.rename(columns=emotion_col_map[lang])
# dev_df['emotion'] = None
# dev_df = dev_df[['text', 'emotion'] + emotion_cols]
# print("Dev. DF length:", len(dev_df), "-->", dev_df.columns.tolist())

Training DF size: 1764 --> ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']
Validation DF size: 365 --> ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']
Testing DF size: 365 --> ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']


In [110]:
# df_non_aug = df[df['aug_go_emotions'] == 0]
# df_aug = df[df['aug_go_emotions'] == 1]

# print("DF non-augmented size:", len(df_non_aug))
# print("DF augmented size:", len(df_aug))
# print("DF total size:", len(df_aug) + len(df_non_aug))

In [111]:
# # Stratified split the data
# def create_stratify_col(df):
#     # Create 'stratify' column for stratified split
#     df.loc[:, 'stratify'] = df['emotion']

#     # Identify classes with only one member
#     single_class = df['emotion'].value_counts()[df['emotion'].value_counts() <= 2].index

#     # Assign a dummy value for the 'stratify' column for these classes
#     df.loc[df['emotion'].isin(single_class), 'stratify'] = 'dummy'

# def stratify(df):
#     create_stratify_col(df)

#     emotion_lt_7 = df['emotion'].value_counts()[(df['emotion'].value_counts() < 7)].index.tolist()
#     emotion_lt_7_cond = df['emotion'].isin(emotion_lt_7)
#     df_emotion_lt_7 = df[emotion_lt_7_cond]
#     df = df[~emotion_lt_7_cond]

#     # Split training DF into training and validation DFs
#     if len(split_sizes) == 3:
#         train_df, val_test_df = train_test_split(df,
#                                                 train_size=split_sizes[0],
#                                                 stratify=df['stratify'],
#                                                 random_state=seed)
#         create_stratify_col(val_test_df)
#         test_size = split_sizes[-1]/(split_sizes[1] + split_sizes[-1])
#         val_df, test_df = train_test_split(val_test_df,
#                                         test_size=test_size,
#                                         stratify=val_test_df['stratify'],
#                                         random_state=seed)

#     train_df = pd.concat([train_df, df_emotion_lt_7])

#     return train_df, val_df, test_df

# train_df_non_aug, val_df_non_aug, test_df_non_aug = stratify(df_non_aug)
# print("Training DF non-augmented size:", len(train_df_non_aug), "-->", train_df_non_aug.columns.tolist())
# print("Validation DF non-augmented size:", len(val_df_non_aug), "-->", val_df_non_aug.columns.tolist())
# print("Testing DF non-augmented size:", len(test_df_non_aug), "-->", test_df_non_aug.columns.tolist())
# print()

# train_df_aug, val_df_aug, test_df_aug = stratify(df_aug)
# print("Training DF augmented size:", len(train_df_aug), "-->", train_df_aug.columns.tolist())
# print("Validation DF augmented size:", len(val_df_aug), "-->", val_df_aug.columns.tolist())
# print("Testing DF augmented size:", len(test_df_aug), "-->", test_df_aug.columns.tolist())
# print()

# print(f"Training non-augmented : augmented ratio: {(len(train_df_non_aug)/(len(train_df_non_aug) + len(train_df_aug))*100):.2f}% : {(len(train_df_aug)/(len(train_df_non_aug) + len(train_df_aug))*100):.2f}%")
# print(f"Validation non-augmented : augmented ratio: {(len(val_df_non_aug)/(len(val_df_non_aug) + len(val_df_aug))*100):.2f}% : {(len(val_df_aug)/(len(val_df_non_aug) + len(val_df_aug))*100):.2f}%")
# print(f"Testing non-augmented : augmented ratio: {(len(test_df_non_aug)/(len(test_df_non_aug) + len(test_df_aug))*100):.2f}% : {(len(test_df_aug)/(len(test_df_non_aug) + len(test_df_aug))*100):.2f}%")
# print()

# train_df = pd.concat([train_df_non_aug, train_df_aug], axis=0)[['text', 'emotion'] + emotion_cols]
# val_df = pd.concat([val_df_non_aug, val_df_aug], axis=0)[['text', 'emotion'] + emotion_cols]
# test_df = pd.concat([test_df_non_aug, test_df_aug], axis=0)[['text', 'emotion'] + emotion_cols]
# print("Training DF size:", len(train_df), "-->", train_df.columns.tolist())
# print("Validation DF size:", len(val_df), "-->", val_df.columns.tolist())
# print("Testing DF size:", len(test_df), "-->", test_df.columns.tolist())

In [112]:
# Show how many test rows carry each combined 'emotion' label.
test_df['emotion'].value_counts()

emotion
senang                     65
sedih                      40
terkejut                   38
biasa                      38
takut                      37
marah                      37
jijik                      33
senang, terkejut           20
marah, jijik               14
senang, sedih               7
takut, sedih                6
marah, sedih                4
jijik, takut                3
jijik, sedih                3
sedih, terkejut             3
jijik, terkejut             2
takut, terkejut             2
senang, sedih, terkejut     2
marah, terkejut             2
jijik, senang               2
marah, takut                2
marah, jijik, terkejut      2
marah, sedih, terkejut      1
takut, senang, terkejut     1
takut, senang               1
Name: count, dtype: int64

## Save Preprocessed Data

In [113]:
# Output directory: <preprocessed_data_dir>/track_a/<hf_data_config>
save_dir = os.path.join(preprocessed_data_dir, 'track_a', hf_data_config)

# Create the directory from Python instead of a `!mkdir -p` shell escape, so
# the cell does not depend on the shell and handles paths containing spaces.
os.makedirs(save_dir, exist_ok=True)

# Write each split without the DataFrame index.
train_df.to_csv(os.path.join(save_dir, 'train.csv'), index=False)
val_df.to_csv(os.path.join(save_dir, 'val.csv'), index=False)
test_df.to_csv(os.path.join(save_dir, 'test.csv'), index=False)
# dev_df.to_csv(os.path.join(save_dir, 'dev.csv'), index=False)

print("Saved to:", save_dir)

Saved to: ./data/preprocessed_data/track_a/sun_go_emotions_70_15_15_back_translated


## Upload Preprocessed Data to Hugging Face

In [114]:
# Upload the saved split files to the Hugging Face dataset repository.
hf_api = HfApi()
hf_api.upload_folder(
    repo_id=hf_data_id,
    repo_type='dataset',
    folder_path=save_dir,
    # Build the in-repo path with explicit forward slashes: os.path.join would
    # insert backslashes on Windows, which is not a valid repo path separator.
    path_in_repo=f"preprocessed_data/track_a/{hf_data_config}",
)

CommitInfo(commit_url='https://huggingface.co/datasets/alxxtexxr/SemEval2025-Task11-Dataset/commit/4799d120d8565340bc80cd042c36b0501012e520', commit_message='Upload folder using huggingface_hub', commit_description='', oid='4799d120d8565340bc80cd042c36b0501012e520', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/alxxtexxr/SemEval2025-Task11-Dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='alxxtexxr/SemEval2025-Task11-Dataset'), pr_revision=None, pr_num=None)