# Libraries

In [1]:
import os
import random
import numpy as np
import pandas as pd
import torch
import transformers
from datasets import load_dataset

# Config

In [226]:
# Single seed shared by set_seed() and every DataFrame.sample() call.
seed = 1337

hf_data_id = 'alxxtexxr/SemEval2025-Task11-Dataset'
hf_data_config = 'track_a_sun_70_15_15_stratify_v2'
# CSV files written by earlier sampling runs. Rows whose `id` appears in any of
# them are removed before sampling; leave the list empty to skip that step.
prev_sampled_data_paths = [
    # 
]

# English emotion column name -> translated label used when the sampled frame
# is renamed before saving (the `idn` in the name suggests Indonesian).
# NOTE: these four settings must stay defined -- the selection, cleaning,
# sampling and saving cells below all read them, so the notebook cannot run
# from a fresh kernel without them.
eng2idn_emotion_map = {
    'anger': 'marah',
    'disgust': 'jijik',
    'fear': 'takut',
    'joy': 'senang',
    'sadness': 'sedih',
    'surprise': 'terkejut',
    'neutral': 'biasa',
}
# Full label list; only referenced by the commented-out cells that rebuild
# filtered.csv from the raw dataset, so it stays disabled with them.
# emotions = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 
#             'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 
#             'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 
#             'remorse', 'sadness', 'surprise', 'neutral']
# Emotion columns kept in the selected / sampled frames.
valid_emotions = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']
# Emotion sampled by this run; also used as the prefix of the output file name.
selected_emotion = 'neutral'
# Rows where any of these columns is non-zero are dropped during cleaning.
prev_selected_emotions = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

# Requested sample sizes, written as <base count> * <multiplier>.
# Multiplier rule of thumb: greater than 100 -> x1.5; less than 100 -> x2;
# less than 50 -> x3; less than 10 -> x5.
num_samples_single = 10 * 5
num_samples_comb = 0

In [227]:
def set_seed(seed):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator, PyTorch
    (``manual_seed``, ``cuda.manual_seed`` and ``cuda.manual_seed_all``) and
    calls ``transformers.set_seed``. It also switches cuDNN to deterministic
    mode with benchmarking disabled, exports ``PYTHONHASHSEED`` and prints the
    seed that was applied.

    Args:
        seed: Value handed unchanged to every seeding call.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here presumably only influences child processes -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python's built-in generator and NumPy's global generator.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch: default generator, current CUDA device, then all CUDA devices.
    for torch_seeder in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        torch_seeder(seed)

    # Deterministic cuDNN behaviour, with benchmarking switched off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # Hugging Face Transformers' own seeding helper.
    transformers.set_seed(seed)

    print(f"Random seed set to: {seed}")

# Apply the configured seed to every RNG once, before any data work.
set_seed(seed)

Random seed set to: 1337


# Data

## Load Data

In [228]:
# dataset = load_dataset(hf_data_id, hf_data_config, split='train')
# print(dataset)

In [229]:
# df = pd.DataFrame(dataset)
# print("DF size:", len(df))
# df.head()

In [230]:
# # One-hot encode column labels into emotion columns
# for label, emotion in enumerate(emotions):
#     df[emotion] = df['labels'].apply(lambda labels: 1 if label in labels else 0)

# df.head()

## Filter Data with Invalid Emotions

In [231]:
# invalid_emotions = list(set(emotions) - set(valid_emotions))

# print("Valid emotions:", valid_emotions)
# print("Invalid emotions:", invalid_emotions)

In [232]:
# df_filtered = df[~df[invalid_emotions].eq(1).any(axis=1)]
# print("Filtered DF size:", len(df_filtered))

In [233]:
# Read the cached, already-filtered frame from disk. The export below is kept
# disabled; presumably it produced this file from the commented-out cells above.
# df_filtered.to_csv('data/go_emotions_sun/filtered.csv', index=False)
filtered_csv_path = 'data/go_emotions_sun/filtered.csv'
df_filtered = pd.read_csv(filtered_csv_path)
print("Filtered DF size:", df_filtered.shape[0])

Filtered DF size: 17529


## Select Data with Specified Emotion

In [234]:
# Keep only rows flagged with the selected emotion, restricted to the id, the
# text and the valid emotion columns.
selected_columns = ['id', 'text'] + valid_emotions
has_selected_emotion = df_filtered[selected_emotion] == 1
df_selected = df_filtered.loc[has_selected_emotion, selected_columns]
print("Selected DF size:", df_selected.shape[0])
df_selected.head()

Selected DF size: 13061


Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise,neutral
0,eebbqej,My favourite food is anything I didn't have to...,0,0,0,0,0,0,1
1,ed00q6i,"Now if he does off himself, everyone will thin...",0,0,0,0,0,0,1
5,eczgv1o,It might be linked to the trust factor of your...,0,0,0,0,0,0,1
6,ef83m1s,Hello everyone. Im from Toronto as well. Can c...,0,0,0,0,0,0,1
8,eeb9aft,that is what retardation looks like,0,0,0,0,0,0,1


## Clean Data Before Sampling

In [235]:
# Remove rows whose id already appears in a previously sampled CSV; with no
# previous files configured the selected frame is used as-is.
if prev_sampled_data_paths:
    prev_frames = [pd.read_csv(path) for path in prev_sampled_data_paths]
    df_sampled_prev = pd.concat(prev_frames)
    already_sampled = df_selected['id'].isin(df_sampled_prev['id'])
    df_cleaned = df_selected[~already_sampled]
    print("Cleaned DF size (after cleaning prev. sampled data):", len(df_cleaned))
else:
    df_cleaned = df_selected

Cleaned DF size (after cleaning prev. sampled data): 12562


In [236]:
# Per-emotion totals before the filter, for comparison with the totals after it.
emotion_totals_before = df_cleaned[valid_emotions].sum()
print(emotion_totals_before)
print()

# Keep only rows where every previously selected emotion column is 0.
no_prev_emotion = df_cleaned[prev_selected_emotions].eq(0).all(axis=1)
df_cleaned = df_cleaned[no_prev_emotion]
print(df_cleaned[valid_emotions].sum())

anger          68
disgust        28
fear           17
joy            43
sadness        53
surprise       31
neutral     12562
dtype: int64

anger           0
disgust         0
fear            0
joy             0
sadness         0
surprise        0
neutral     12324
dtype: int64


In [237]:
def _print_duplicate_counts(frame, id_label, text_label):
    """Print how many rows repeat an earlier `id` and an earlier `text`."""
    print(id_label, frame['id'].duplicated().sum())
    print(text_label, frame['text'].duplicated().sum())
    print()


_print_duplicate_counts(
    df_cleaned,
    "Total duplicates by ID:",
    "Total duplicates by text:",
)

# Keep the first occurrence of every text and drop later repeats.
is_repeated_text = df_cleaned.duplicated(subset=['text'], keep='first')
df_cleaned = df_cleaned[~is_repeated_text]

_print_duplicate_counts(
    df_cleaned,
    "Total duplicates by ID (after cleaning duplicates):",
    "Total duplicates by text (after cleaning duplicates):",
)

print("Cleaned DF size (after cleaning duplicates):", len(df_cleaned))

Total duplicates by ID: 0
Total duplicates by text: 25

Total duplicates by ID (after cleaning duplicates): 0
Total duplicates by text (after cleaning duplicates): 0

Cleaned DF size (after cleaning duplicates): 12299


In [238]:
# Flag texts containing Reddit-specific artifacts: subreddit / user references,
# the "NAME]" and "RELIGION]" placeholder tokens, and vote talk.
# The tokens are plain substrings, so they are matched literally (regex=False)
# rather than relying on how the regex engine treats an unescaped "]".
# na=False makes a missing text count as "no match" instead of putting NaN
# into the boolean mask.
cond_r = df_cleaned['text'].str.contains("r/", regex=False, na=False)
cond_u = df_cleaned['text'].str.contains("/u/", regex=False, na=False)
cond_name = df_cleaned['text'].str.contains("NAME]", regex=False, na=False)
cond_religion = df_cleaned['text'].str.contains("RELIGION]", regex=False, na=False)
cond_downvot = df_cleaned['text'].str.contains("downvot", regex=False, na=False)
cond_upvot = df_cleaned['text'].str.contains("upvot", regex=False, na=False)
# cond_double = df_cleaned['text'].str.contains(r'(.)\1{2,}')

# Drop every row that matched at least one of the conditions above.
df_cleaned = df_cleaned[~(cond_r | cond_u | cond_name | cond_religion | cond_downvot | cond_upvot)]
print("Cleaned DF size (after cleaning with specific conditions):", len(df_cleaned))

Cleaned DF size (after cleaning with specific conditions): 9936


## Sample Data

In [239]:
# Number of active emotion flags per row. A column-wise sum replaces the
# row-by-row Python loop, and also yields a proper (empty) Series when the
# frame has no rows.
emotion_counts = df_cleaned[valid_emotions].sum(axis=1).astype(int)
# assign() returns a new frame, so the column is not written onto a filtered
# slice (which is what triggers pandas' SettingWithCopyWarning).
df_cleaned = df_cleaned.assign(num_emotions=emotion_counts)

# Split into rows carrying only the selected emotion and rows where it is
# combined with at least one other emotion.
df_single = df_cleaned[(df_cleaned[selected_emotion] == 1) & (df_cleaned['num_emotions'] == 1)]
df_comb = df_cleaned[(df_cleaned[selected_emotion] == 1) & (df_cleaned['num_emotions'] > 1)]

print("Total data with single emotion:", len(df_single))
print("Total data with combined emotions:", len(df_comb))

Total data with single emotion: 9936
Total data with combined emotions: 0


In [240]:
# Cap the requested sample sizes at the number of rows actually available.
num_samples_single = min(len(df_single), int(num_samples_single))
num_samples_comb = min(len(df_comb), int(num_samples_comb))

# Draw both samples with the shared seed, then stack them and keep only the
# id, text and emotion columns.
output_columns = ['id', 'text'] + valid_emotions
df_sampled_single = df_single.sample(n=num_samples_single, random_state=seed)
df_sampled_comb = df_comb.sample(n=num_samples_comb, random_state=seed)
sampled_parts = [df_sampled_single, df_sampled_comb]
df_sampled = pd.concat(sampled_parts)[output_columns]
# df_sampled['neutral'] = 0

for size_label, sampled_frame in (
    ("Total sampled data with single emotion:", df_sampled_single),
    ("Total sampled data with combined emotions:", df_sampled_comb),
    ("Total sampled data:", df_sampled),
):
    print(size_label, len(sampled_frame))
df_sampled.head()

Total sampled data with single emotion: 50
Total sampled data with combined emotions: 0
Total sampled data: 50


Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise,neutral
12509,ee54dcb,"haha, rape",0,0,0,0,0,0,1
3211,edkr19d,That's what they're for! I wouldn't use it for...,0,0,0,0,0,0,1
379,efcsmiw,"Damn, hoped i had missed some fan translations...",0,0,0,0,0,0,1
11076,ee43fc3,Kid has profanity on the front of his shirt......,0,0,0,0,0,0,1
5103,ef4fl50,reddit was having server issues,0,0,0,0,0,0,1


## Save Data

In [241]:
# Translated column names, in the same order as valid_emotions.
translated_emotion_columns = [eng2idn_emotion_map[emotion] for emotion in valid_emotions]


def _join_active_emotions(row):
    """Return the translated names of the emotions set to 1 in `row`, comma-separated."""
    return ', '.join(column for column in translated_emotion_columns if row[column] == 1)


# Rename the emotion columns via the map, then add a readable `emotion` column.
df_sampled = df_sampled.rename(columns=eng2idn_emotion_map)
df_sampled['emotion'] = df_sampled.apply(_join_active_emotions, axis=1)
df_sampled['emotion'].value_counts()

emotion
biasa    50
Name: count, dtype: int64

In [242]:
# The file name records the selected emotion and both (capped) sample sizes.
output_file_name = f'{selected_emotion}_single_{num_samples_single}_comb_{num_samples_comb}.csv'
save_path = './data/go_emotions_sun/' + output_file_name
df_sampled.to_csv(save_path, index=False)
print("Saved to:", save_path)

Saved to: ./data/go_emotions_sun/neutral_single_50_comb_0.csv
