# Libraries

In [20]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from huggingface_hub import HfApi

# Config

In [21]:
# --- Run configuration -------------------------------------------------------
# Seed shared by the RNG setup and the train/validation split.
seed = 42

# Language code; it selects the input CSV files and the per-language lookup tables.
lang = 'eng'

# Input / output locations, relative to the notebook's working directory.
raw_data_dir = './data/public_data/'
preprocessed_data_dir = './data/preprocessed_data/'

# Hugging Face dataset repository that receives the preprocessed files.
repo_id = 'alxxtexxr/SemEval2025-Task11-Dataset'

In [14]:
def set_seed(seed):
    """Seed the global NumPy and Python RNGs and export ``PYTHONHASHSEED``.

    Args:
        seed: Value passed to ``np.random.seed`` and ``random.seed``; its
            ``str()`` form is stored in ``os.environ['PYTHONHASHSEED']``.

    Returns:
        None. A confirmation line is printed to stdout.

    Note:
        NOTE(review): Python reads PYTHONHASHSEED at interpreter startup, so
        setting it here presumably only influences processes launched from
        this kernel, not the running kernel's own hashing -- confirm.
    """
    # Same order as before: NumPy's legacy global RNG first, then the stdlib RNG.
    for seed_rng in (np.random.seed, random.seed):
        seed_rng(seed)

    # Environment values must be strings.
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Random seed set to: {seed}")

# Apply the configured seed to the global RNGs before any data is processed.
set_seed(seed)

Random seed set to: 42


# Data

## Preprocess Data

In [15]:
# Read the raw Track A training CSV for the selected language.
train_csv_path = os.path.join(raw_data_dir, f'train/track_a/{lang}.csv')
train_df = pd.read_csv(train_csv_path)

n_train_rows = len(train_df)
print("Training DF length:", n_train_rows)
print()

# Last expression of the cell: rendered as the cell's rich output.
train_df

Training DF length: 2768



Unnamed: 0,id,text,Anger,Fear,Joy,Sadness,Surprise
0,eng_train_track_a_00001,But not very happy.,0,0,1,1,0
1,eng_train_track_a_00002,Well she's not gon na last the whole song like...,0,0,1,0,0
2,eng_train_track_a_00003,She sat at her Papa's recliner sofa only to mo...,0,0,0,0,0
3,eng_train_track_a_00004,"Yes, the Oklahoma city bombing.",1,1,0,1,1
4,eng_train_track_a_00005,They were dancing to Bolero.,0,0,1,0,0
...,...,...,...,...,...,...,...
2763,eng_train_track_a_02764,"""Yeah, but did you just find that?",0,1,0,0,1
2764,eng_train_track_a_02765,I did as little as possible with my right hand...,0,0,0,0,0
2765,eng_train_track_a_02766,"Okay that sucks, right?",1,0,0,1,0
2766,eng_train_track_a_02767,"The spark leaped through his body into mine, a...",0,1,0,0,1


In [17]:
# Per-language lookup: raw label column name -> language-specific emotion name.
emotion_col_map = {
    'eng': {
        'Anger': 'anger',
        'Fear': 'fear',
        'Joy': 'joy',
        'Sadness': 'sad',
        'Surprise': 'surprise',
    },
    'deu': {
        'Anger': 'wut',
        'Disgust': 'ekel',
        'Fear': 'angst',
        'Joy': 'freude',
        'Sadness': 'trauer',
        'Surprise': 'überraschung',
    },
    'sun': {
        'Anger': 'marah',
        'Disgust': 'jijik',
        'Fear': 'takut',
        'Joy': 'senang',
        'Sadness': 'sedih',
        'Surprise': 'terkejut',
    },
}

# Emotion names for the active language, in the order they are declared above.
emotion_cols = [*emotion_col_map[lang].values()]

# Label used per language when none of the emotion columns is set for a text.
empty_emotion_map = {'eng': 'neutral', 'deu': 'neutral', 'sun': 'biasa'}

# Rename the raw label columns to the language-specific emotion names.
train_df = train_df.rename(columns=emotion_col_map[lang])


def _positive_emotions(row):
    """Return the emotion names whose value in `row` equals 1, joined by ', '."""
    return ', '.join(col for col in emotion_cols if row[col] == 1)


# Build the 'emotion' column row by row; rows with no positive emotion yield ''
# and are then given the language's "empty" label.
emotion_labels = train_df.apply(_positive_emotions, axis=1)
train_df['emotion'] = emotion_labels.replace('', empty_emotion_map[lang])

emotion_distribution = train_df['emotion'].value_counts()
print(emotion_distribution)
print()

# Create 'stratify' column for stratified split. It starts as a copy of
# 'emotion'; label combinations too rare to stratify on are regrouped below.
train_df['stratify'] = train_df['emotion']

# Identify classes with only one member: a stratified split needs at least
# two samples per class, so these cannot be used as strata on their own.
emotion_counts = train_df['emotion'].value_counts()
single_class = emotion_counts[emotion_counts == 1].index

# Pool every single-member class into one shared 'dummy' stratum.
train_df.loc[train_df['emotion'].isin(single_class), 'stratify'] = 'dummy'

# Edge case: with exactly one single-member class the 'dummy' stratum itself
# has only one member, and the stratified split would still raise. Fold that
# lone sample into the most frequent class instead, where one extra row
# distorts the class proportions the least.
if len(single_class) == 1 and len(emotion_counts) > 1:
    train_df.loc[train_df['stratify'] == 'dummy', 'stratify'] = emotion_counts.idxmax()

# Split the training DF 80/20 into training and validation DFs, stratified on
# the 'stratify' column; only the text, the combined label and the per-emotion
# columns are carried into the split frames.
split_cols = ['text', 'emotion', *emotion_cols]
train_df_, val_df = train_test_split(
    train_df[split_cols],
    stratify=train_df['stratify'],
    test_size=0.2,
    random_state=seed,
)

print("Training DF length (splitted):", len(train_df_))
print("Validation DF length:", len(val_df))

# Read the Track A dev CSV for the selected language; it is used as the test set.
test_csv_path = os.path.join(raw_data_dir, f'dev/track_a/{lang}_a.csv')
test_df = pd.read_csv(test_csv_path)

# Add a placeholder 'emotion' column filled with None, then keep only the text
# and that placeholder column.
test_df['emotion'] = None
test_cols = ['text', 'emotion']
test_df = test_df[test_cols]

print("Testing DF length:", len(test_df))

emotion
joy                                429
fear, sad                          412
fear                               411
fear, surprise                     324
neutral                            239
sad                                133
fear, sad, surprise                124
surprise                           114
joy, surprise                      108
anger, fear, sad                    77
anger, fear                         66
anger                               54
anger, fear, sad, surprise          51
fear, joy                           49
anger, fear, surprise               42
fear, joy, surprise                 37
joy, sad                            25
anger, sad                          20
anger, surprise                     13
sad, surprise                       11
fear, joy, sad                      10
fear, joy, sad, surprise             5
joy, sad, surprise                   4
anger, joy                           3
anger, sad, surprise                 3
anger, fear, joy,

## Save Preprocessed Data

In [19]:
# Save preprocessed data under <preprocessed_data_dir>/<lang>/
lang_dir = os.path.join(preprocessed_data_dir, lang)

# Create the output directory (and any missing parents) in pure Python instead
# of shelling out with `!mkdir -p`: this works on every platform, also runs
# outside IPython, and avoids interpolating a variable into a shell command.
os.makedirs(lang_dir, exist_ok=True)

# to_csv keeps its default of writing the DataFrame index as the first column.
train_df_.to_csv(os.path.join(lang_dir, 'train.csv'))
val_df.to_csv(os.path.join(lang_dir, 'val.csv'))
test_df.to_csv(os.path.join(lang_dir, 'test.csv'))

## Upload Preprocessed Data to Hugging Face

In [24]:
# Upload the language's preprocessed folder to the Hugging Face dataset repo.
# NOTE(review): HfApi() is created without an explicit token, so credentials
# presumably come from the local Hugging Face login/environment -- confirm.
hf_api = HfApi()
hf_api.upload_folder(
    repo_id=repo_id,
    repo_type='dataset',
    folder_path=lang_dir,
    # Paths inside the repo use forward slashes, so build this one explicitly
    # rather than with os.path.join, which yields backslashes on Windows.
    path_in_repo=f'preprocessed_data/{lang}',
)

CommitInfo(commit_url='https://huggingface.co/datasets/alxxtexxr/SemEval2025-Task11-Dataset/commit/22411f10528c0870454fb9efc25c6ecb3ca4e6d8', commit_message='Upload folder using huggingface_hub', commit_description='', oid='22411f10528c0870454fb9efc25c6ecb3ca4e6d8', pr_url=None, pr_revision=None, pr_num=None)