# Libraries

In [12]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from huggingface_hub import HfApi

# Config

In [68]:
# Seed reused for the global RNGs and for every train_test_split call
seed = 42
# Language code: selects the input CSV files and the emotion column mapping
lang = 'eng'
# Input (raw) and output (preprocessed) data locations
raw_data_dir = './data/public_data/'
preprocessed_data_dir = './data/preprocessed_data/'
# Hugging Face dataset repository that receives the preprocessed splits
repo_id = 'alxxtexxr/SemEval2025-Task11-Dataset'
# Fractions for the training, validation, and testing splits (in that order)
split_sizes = [0.7, 0.15, 0.15]
# Compare with a tolerance: an exact float comparison rejects valid splits
# whose binary sum is not exactly 1.0 (e.g. [0.7, 0.2, 0.1])
assert np.isclose(sum(split_sizes), 1.0), \
    f"split_sizes must sum to 1.0, got {sum(split_sizes)}"

In [69]:
def set_seed(seed):
    """Seed the global random number generators and report the seed used.

    Seeds NumPy's legacy global RNG and Python's ``random`` module, then
    stores the seed in the ``PYTHONHASHSEED`` environment variable.

    NOTE: ``PYTHONHASHSEED`` is read at interpreter start-up, so assigning it
    here is only inherited by child processes; it does not change string
    hashing in the already-running kernel.

    Args:
        seed: Integer seed passed to every seeding routine.
    """
    # Seed NumPy first, then the standard-library generator
    for seed_fn in (np.random.seed, random.seed):
        seed_fn(seed)

    # Export the hash seed for any subprocess started from this kernel
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Random seed set to: {seed}")

# Apply the configured seed to the global RNGs before any data is processed
set_seed(seed)

Random seed set to: 42


# Data

## Preprocess Data

In [70]:
# Load the raw track A training data for the selected language
train_df = pd.read_csv(os.path.join(raw_data_dir, f'train/track_a/{lang}.csv'))

# Report the row count followed by a blank line, then display the frame
print("Training DF length:", len(train_df), end="\n\n")
train_df

Training DF length: 2768



Unnamed: 0,id,text,Anger,Fear,Joy,Sadness,Surprise
0,eng_train_track_a_00001,But not very happy.,0,0,1,1,0
1,eng_train_track_a_00002,Well she's not gon na last the whole song like...,0,0,1,0,0
2,eng_train_track_a_00003,She sat at her Papa's recliner sofa only to mo...,0,0,0,0,0
3,eng_train_track_a_00004,"Yes, the Oklahoma city bombing.",1,1,0,1,1
4,eng_train_track_a_00005,They were dancing to Bolero.,0,0,1,0,0
...,...,...,...,...,...,...,...
2763,eng_train_track_a_02764,"""Yeah, but did you just find that?",0,1,0,0,1
2764,eng_train_track_a_02765,I did as little as possible with my right hand...,0,0,0,0,0
2765,eng_train_track_a_02766,"Okay that sucks, right?",1,0,0,1,0
2766,eng_train_track_a_02767,"The spark leaped through his body into mine, a...",0,1,0,0,1


In [72]:
# Per-language mapping from the raw emotion column names to the label words
# used in the preprocessed data. Key order defines the order of emotion_cols.
emotion_col_map = {
    'eng': {
        'Anger': 'anger',
        'Fear': 'fear',
        'Joy': 'joy',
        'Sadness': 'sad',
        'Surprise': 'surprise',
    },
    'deu': {
        'Anger': 'wut',
        'Disgust': 'ekel',
        'Fear': 'angst',
        'Joy': 'freude',
        'Sadness': 'trauer',
        'Surprise': 'überraschung',
    },
    'sun': {
        'Anger': 'marah',
        'Disgust': 'jijik',
        'Fear': 'takut',
        'Joy': 'senang',
        'Sadness': 'sedih',
        'Surprise': 'terkejut',
    },
}
# Renamed emotion columns for the active language, in mapping order
emotion_cols = [*emotion_col_map[lang].values()]

# Label written to the 'emotion' column when a row has no positive emotion
empty_emotion_map = {
    'eng': 'neutral',
    'deu': 'neutral',
    'sun': 'biasa',
}

# Switch the raw emotion column names to the language-specific label words
train_df = train_df.rename(columns=emotion_col_map[lang])

# Build the 'emotion' column: a comma-separated list of every emotion flagged
# with 1 in the row, falling back to the language's "no emotion" label when
# the list is empty
train_df['emotion'] = (
    train_df
    .apply(lambda row: ', '.join([col for col in emotion_cols if row[col] == 1]), axis=1)
    .replace('', empty_emotion_map[lang])
)

def create_stratify_col(train_df):
    """Add a 'stratify' column that is safe to pass to a stratified split.

    The column starts as a copy of 'emotion'. Every class that occurs exactly
    once is relabelled 'dummy', because a stratified split needs at least two
    members per class. When there is exactly one such class, the 'dummy'
    bucket would itself contain a single row, so the smallest class with more
    than one member is folded into the bucket as well.

    Args:
        train_df: DataFrame with an 'emotion' column. It is modified in place
            by adding (or overwriting) the 'stratify' column.

    Returns:
        None.
    """
    emotions = train_df['emotion']

    # Count each class once and pick out the ones with a single member
    class_counts = emotions.value_counts()
    rare_classes = list(class_counts[class_counts == 1].index)

    # A lone singleton would leave a one-member 'dummy' bucket; pair it with
    # the smallest multi-member class so the bucket can still be split
    if len(rare_classes) == 1:
        larger_classes = class_counts[class_counts > 1]
        if not larger_classes.empty:
            rare_classes.append(larger_classes.idxmin())

    # Assign the dummy label to the rare classes, keep every other label as is
    train_df['stratify'] = emotions.mask(emotions.isin(rare_classes), 'dummy')

create_stratify_col(train_df)

# Split training DF into training, validation, and testing DFs.
# Fail loudly on any other configuration: silently skipping the split would
# leave train_df_/val_df/test_df undefined (or stale from an earlier run) for
# the cells that save and upload them.
if len(split_sizes) != 3:
    raise ValueError(
        f"split_sizes must contain exactly 3 fractions (train, val, test), got {len(split_sizes)}"
    )

# First split: training portion vs. the combined validation + testing portion
train_df_, val_test_df = train_test_split(train_df[['text', 'emotion'] + emotion_cols],
                                    train_size=split_sizes[0],
                                    stratify=train_df['stratify'],
                                    random_state=seed)

# Class counts changed after the first split, so rebuild the stratify column
create_stratify_col(val_test_df)
# Testing share expressed relative to the validation + testing portion
test_size = split_sizes[-1]/(split_sizes[1] + split_sizes[-1])
val_df, test_df = train_test_split(val_test_df[['text', 'emotion'] + emotion_cols],
                                    test_size=test_size,
                                    stratify=val_test_df['stratify'],
                                    random_state=seed)

# Every row must land in exactly one of the three splits
assert len(train_df_) + len(val_df) + len(test_df) == len(train_df)

print("Training DF length (splitted):", len(train_df_), "-->", train_df_.columns.tolist())
print("Validation DF length:", len(val_df), "-->", val_df.columns.tolist())
print("Testing DF length:", len(test_df), "-->", test_df.columns.tolist())

# Load the dev split, rename its emotion columns like the training data,
# add an 'emotion' column filled with None, and keep the same column layout
# as the train/val/test frames
dev_df = (
    pd.read_csv(os.path.join(raw_data_dir, f'dev/track_a/{lang}_a.csv'))
    .rename(columns=emotion_col_map[lang])
    .assign(emotion=None)
    [['text', 'emotion'] + emotion_cols]
)
print("Dev. DF length:", len(dev_df), "-->", dev_df.columns.tolist())

Training DF length (splitted): 1937 --> ['text', 'emotion', 'anger', 'fear', 'joy', 'sad', 'surprise']
Validation DF length: 415 --> ['text', 'emotion', 'anger', 'fear', 'joy', 'sad', 'surprise']
Testing DF length: 416 --> ['text', 'emotion', 'anger', 'fear', 'joy', 'sad', 'surprise']
Dev. DF length: 116 --> ['text', 'emotion', 'anger', 'fear', 'joy', 'sad', 'surprise']


## Save Preprocessed Data

In [73]:
# Save preprocessed data
save_dir_name = lang + '_' + '_'.join([str(int(split_size * 100)) for split_size in split_sizes])
save_dir = os.path.join(preprocessed_data_dir, save_dir_name)

!mkdir -p $save_dir

train_df_.to_csv(os.path.join(save_dir, 'train.csv'))
val_df.to_csv(os.path.join(save_dir, 'val.csv'))
test_df.to_csv(os.path.join(save_dir, 'test.csv'))

print("Saved to:", save_dir)

Saved to: ./data/preprocessed_data/eng_70_15_15


## Upload Preprocessed Data to Hugging Face

In [74]:
# Upload the saved split directory to the dataset repository.
# path_in_repo is built with '/' explicitly: it is a path inside the remote
# repository, not a local filesystem path, so os.path.join (which uses
# backslashes on Windows) is the wrong tool for it.
hf_api = HfApi()
hf_api.upload_folder(
    repo_id=repo_id,
    repo_type='dataset',
    folder_path=save_dir,
    path_in_repo=f'preprocessed_data/{save_dir_name}',
)

CommitInfo(commit_url='https://huggingface.co/datasets/alxxtexxr/SemEval2025-Task11-Dataset/commit/6a568e4fb13bc58382db8df946d00fa00c4f0e6e', commit_message='Upload folder using huggingface_hub', commit_description='', oid='6a568e4fb13bc58382db8df946d00fa00c4f0e6e', pr_url=None, pr_revision=None, pr_num=None)