# Libraries

In [1]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from huggingface_hub import HfApi

# Config

In [2]:
# Reproducibility seed, also passed as `random_state` to the splits below.
seed = 42
# Language code; selects the raw CSV file and the emotion-column mapping.
lang = 'eng'
# Input / output locations (relative to the notebook's working directory).
raw_data_dir = './data/public_data/'
preprocessed_data_dir = './data/preprocessed_data/'
# Hugging Face dataset repository that receives the preprocessed splits.
repo_id = 'alxxtexxr/SemEval2025-Task11-Dataset'
# Train / validation / test fractions.
split_sizes = [0.7, 0.15, 0.15]
# Compare with a tolerance: exact float equality rejects valid fractions such
# as [0.7, 0.2, 0.1], whose sum evaluates to 0.9999999999999999.
assert np.isclose(sum(split_sizes), 1.0), f"split_sizes must sum to 1.0, got {sum(split_sizes)}"

In [3]:
def set_seed(seed):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global RNG, stores the seed
    in the ``PYTHONHASHSEED`` environment variable, and prints a confirmation.

    Args:
        seed: Integer seed value.
    """
    # NOTE(review): PYTHONHASHSEED is presumably only read at interpreter
    # startup, so this likely affects child processes rather than the running
    # kernel -- confirm if hash ordering matters here.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python's built-in RNG.
    random.seed(seed)

    # NumPy's global RNG.
    np.random.seed(seed)

    print(f"Random seed set to: {seed}")

# Apply the configured seed once, before any data is loaded or processed.
set_seed(seed)

Random seed set to: 42


# Data

## Preprocess Data

In [4]:
# Read the raw Track B training CSV for the configured language.
train_csv_path = os.path.join(raw_data_dir, f'train/track_b/{lang}.csv')
train_df = pd.read_csv(train_csv_path)

# Report the row count (followed by a blank line), then display the frame.
print("Training DF length:", len(train_df), end="\n\n")
train_df

Training DF length: 2768



Unnamed: 0,id,text,Anger,Fear,Joy,Sadness,Surprise
0,eng_train_track_b_00001,My Spanish language skills were fairly basic.,0,1,0,0,0
1,eng_train_track_b_00002,Don't mess with my orange juice.,2,0,0,0,0
2,eng_train_track_b_00003,"So, I am from a science background and analyze...",0,0,0,0,0
3,eng_train_track_b_00004,I was writing away.,0,0,1,0,0
4,eng_train_track_b_00005,Apparently it wasn't as life threatening as I ...,0,1,0,1,0
...,...,...,...,...,...,...,...
2763,eng_train_track_b_02764,"""That's were those people went missing"".",0,3,0,1,2
2764,eng_train_track_b_02765,"`` Out of my mouth came, `` I didn't see Creep...",0,2,0,0,1
2765,eng_train_track_b_02766,My muscles all relax With the feeling of an em...,0,0,2,0,0
2766,eng_train_track_b_02767,I was going up to see my Avs play the Red Wings.,0,0,2,0,0


In [20]:
# Per-language mapping from the raw CSV emotion headers to the column names
# used in the rest of the notebook. Key order defines the label column order.
emotion_col_map = {
    'eng': {
        'Anger': 'anger',
        'Fear': 'fear',
        'Joy': 'joy',
        'Sadness': 'sad',
        'Surprise': 'surprise',
    },
    'deu': {
        'Anger': 'wut',
        'Disgust': 'ekel',
        'Fear': 'angst',
        'Joy': 'freude',
        'Sadness': 'trauer',
        'Surprise': 'überraschung',
    },
    'sun': {
        'Anger': 'marah',
        'Disgust': 'jijik',
        'Fear': 'takut',
        'Joy': 'senang',
        'Sadness': 'sedih',
        'Surprise': 'terkejut',
    },
}
# Renamed emotion columns for the selected language, in mapping order.
emotion_cols = [*emotion_col_map[lang].values()]

# Per-language label for a row that carries no emotion at all.
empty_emotion_map = dict(eng='neutral', deu='neutral', sun='biasa')

# Switch the frame over to the renamed emotion columns.
train_df = train_df.rename(columns=emotion_col_map[lang])

# # Create 'emotion' column by combining the positive emotions
# train_df['emotion'] = train_df.apply(lambda row: ', '.join([col for col in emotion_cols if row[col] == 1]), axis=1)
# train_df['emotion'] = train_df['emotion'].replace('', empty_emotion_map[lang]) # Fill empty emotion
# # print(train_df['emotion'].value_counts())
# # print()

In [64]:
# One-hot encode every emotion's intensity into `<emotion>_<level>` columns.
#
# All emotions share one level set (every value observed in any emotion
# column). Without this, get_dummies only emits the levels that occur in each
# individual column, so an emotion that never reaches some level would
# silently get fewer label columns than the others.
observed_levels = pd.Series(train_df[emotion_cols].to_numpy().ravel()).dropna().unique()
intensity_dtype = pd.CategoricalDtype(categories=sorted(observed_levels))

one_hot_emotion_df = pd.get_dummies(
    train_df[emotion_cols].astype(intensity_dtype),
    columns=emotion_cols,
    prefix=emotion_cols,
).astype(int)
one_hot_emotion_cols = one_hot_emotion_df.columns.tolist()
print("One-hot emotion columns:", one_hot_emotion_cols)

# Sanity check: the label layout is rectangular (emotions x levels).
assert len(one_hot_emotion_cols) == len(emotion_cols) * len(intensity_dtype.categories)

# Keep the original columns and append the one-hot columns next to them.
train_df_one_hot = pd.concat([train_df, one_hot_emotion_df], axis=1)
train_df_one_hot.head()

One-hot emotion columns: ['anger_0', 'anger_1', 'anger_2', 'anger_3', 'fear_0', 'fear_1', 'fear_2', 'fear_3', 'joy_0', 'joy_1', 'joy_2', 'joy_3', 'sad_0', 'sad_1', 'sad_2', 'sad_3', 'surprise_0', 'surprise_1', 'surprise_2', 'surprise_3']


Unnamed: 0,id,text,anger,fear,joy,sad,surprise,anger_0,anger_1,anger_2,...,joy_2,joy_3,sad_0,sad_1,sad_2,sad_3,surprise_0,surprise_1,surprise_2,surprise_3
0,eng_train_track_b_00001,My Spanish language skills were fairly basic.,0,1,0,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
1,eng_train_track_b_00002,Don't mess with my orange juice.,2,0,0,0,0,0,0,1,...,0,0,1,0,0,0,1,0,0,0
2,eng_train_track_b_00003,"So, I am from a science background and analyze...",0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
3,eng_train_track_b_00004,I was writing away.,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
4,eng_train_track_b_00005,Apparently it wasn't as life threatening as I ...,0,1,0,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0


In [74]:
def create_stratify_col(df, cols=None):
    """Add a 'stratify' column to ``df`` (in place) for stratified splitting.

    Each row is labelled with its emotion presence pattern: one character per
    column in ``cols``, '1' when the value is greater than 0 and '0' otherwise.
    Patterns that occur only once are relabelled 'dummy', because a class with
    a single member cannot be stratified.

    Args:
        df: DataFrame containing the columns in ``cols``; modified in place.
        cols: Columns that form the pattern. Defaults to the notebook-level
            ``emotion_cols``.

    Returns:
        None. The result is written to ``df['stratify']``.
    """
    if cols is None:
        # Default to the notebook-level list of emotion columns.
        cols = emotion_cols

    # Presence pattern, e.g. '01010' for five emotion columns.
    presence = (df[cols] > 0).astype(int).astype(str)
    df['stratify'] = presence.apply(lambda row: ''.join(row), axis=1)

    # Patterns with a single member (counts computed once).
    pattern_counts = df['stratify'].value_counts()
    rare_patterns = list(pattern_counts[pattern_counts == 1].index)

    # With exactly one singleton, 'dummy' would itself have one member and the
    # stratified split would still fail. Merge it with the smallest remaining
    # pattern so 'dummy' has at least two members.
    if len(rare_patterns) == 1 and len(pattern_counts) > 1:
        rare_patterns.append(pattern_counts.drop(rare_patterns).idxmin())

    # Pool the rare patterns under one shared label.
    df.loc[df['stratify'].isin(rare_patterns), 'stratify'] = 'dummy'

# Label every row of the full frame with its stratification class.
create_stratify_col(train_df_one_hot)

# Split the training DF into training, validation and testing DFs.
# Fail loudly on an unsupported configuration: silently skipping the split
# would let later cells run against frames left over from an earlier run.
if len(split_sizes) != 3:
    raise ValueError(f"split_sizes must hold 3 fractions (train, val, test), got {len(split_sizes)}")

# First split: training set vs. the pooled validation + testing rows.
# The raw emotion columns are kept so the pooled rows can be re-stratified.
train_df_, val_test_df = train_test_split(
    train_df_one_hot[['text'] + emotion_cols + one_hot_emotion_cols],
    train_size=split_sizes[0],
    stratify=train_df_one_hot['stratify'],
    random_state=seed,
)
train_df_ = train_df_[['text'] + one_hot_emotion_cols]

# Recompute the classes on the pooled rows only; work on an explicit copy
# because create_stratify_col writes into the frame it is given.
val_test_df = val_test_df.copy()
create_stratify_col(val_test_df)

# Second split: share of the pooled rows that goes to the testing set.
test_size = split_sizes[-1]/(split_sizes[1] + split_sizes[-1])
val_df, test_df = train_test_split(
    val_test_df[['text'] + one_hot_emotion_cols],
    test_size=test_size,
    stratify=val_test_df['stratify'],
    random_state=seed,
)

# No row may be lost or duplicated by the two-stage split.
assert len(train_df_) + len(val_df) + len(test_df) == len(train_df_one_hot)

print("Training DF length (splitted):", len(train_df_), "-->", train_df_.columns.tolist())
print("Validation DF length:", len(val_df), "-->", val_df.columns.tolist())
print("Testing DF length:", len(test_df), "-->", test_df.columns.tolist())

# dev_df = pd.read_csv(os.path.join(raw_data_dir, f'dev/track_a/{lang}_a.csv'))
# dev_df = dev_df.rename(columns=emotion_col_map[lang])
# dev_df['emotion'] = None
# dev_df = dev_df[['text', 'emotion'] + emotion_cols]
# print("Dev. DF length:", len(dev_df), "-->", dev_df.columns.tolist())

Training DF length (splitted): 1937 --> ['text', 'anger_0', 'anger_1', 'anger_2', 'anger_3', 'fear_0', 'fear_1', 'fear_2', 'fear_3', 'joy_0', 'joy_1', 'joy_2', 'joy_3', 'sad_0', 'sad_1', 'sad_2', 'sad_3', 'surprise_0', 'surprise_1', 'surprise_2', 'surprise_3']
Validation DF length: 415 --> ['text', 'anger_0', 'anger_1', 'anger_2', 'anger_3', 'fear_0', 'fear_1', 'fear_2', 'fear_3', 'joy_0', 'joy_1', 'joy_2', 'joy_3', 'sad_0', 'sad_1', 'sad_2', 'sad_3', 'surprise_0', 'surprise_1', 'surprise_2', 'surprise_3']
Testing DF length: 416 --> ['text', 'anger_0', 'anger_1', 'anger_2', 'anger_3', 'fear_0', 'fear_1', 'fear_2', 'fear_3', 'joy_0', 'joy_1', 'joy_2', 'joy_3', 'sad_0', 'sad_1', 'sad_2', 'sad_3', 'surprise_0', 'surprise_1', 'surprise_2', 'surprise_3']


## Save Preprocessed Data

In [76]:
# Save preprocessed data
# Directory name = language + split percentages, e.g. 'eng_70_15_15'.
# round() instead of int(): int() truncates, so 0.57 * 100 would become 56.
split_tag = '_'.join(str(round(split_size * 100)) for split_size in split_sizes)
save_dir_name = f'{lang}_{split_tag}'
save_dir = os.path.join(preprocessed_data_dir, 'track_b', save_dir_name)

# Pure-Python replacement for `!mkdir -p`: no shell, no error if it exists.
os.makedirs(save_dir, exist_ok=True)

# Write each split to '<split>.csv' without the index column.
for split_name, split_df in (('train', train_df_), ('val', val_df), ('test', test_df)):
    split_df.to_csv(os.path.join(save_dir, f'{split_name}.csv'), index=False)
# dev_df.to_csv(os.path.join(save_dir, 'dev.csv'), index=False)

print("Saved to:", save_dir)

Saved to: ./data/preprocessed_data/track_b/eng_70_15_15


## Upload Preprocessed Data to Hugging Face

In [78]:
# Upload the saved split directory to the Hugging Face dataset repository.
# NOTE(review): presumably relies on a Hugging Face token already configured
# in the environment -- confirm before running on a fresh machine.
hf_api = HfApi()
hf_api.upload_folder(
    repo_id=repo_id,
    repo_type='dataset',
    folder_path=save_dir,
    # Repository paths use '/' on every platform; os.path.join would produce
    # backslashes on Windows.
    path_in_repo=f'preprocessed_data/track_b/{save_dir_name}',
)

CommitInfo(commit_url='https://huggingface.co/datasets/alxxtexxr/SemEval2025-Task11-Dataset/commit/a63da5210ff95234be05ce46daabfd77ff824dde', commit_message='Upload folder using huggingface_hub', commit_description='', oid='a63da5210ff95234be05ce46daabfd77ff824dde', pr_url=None, pr_revision=None, pr_num=None)