# Libraries

In [107]:
import os
import random
import numpy as np
import pandas as pd
import unicodedata
from sklearn.model_selection import train_test_split
from huggingface_hub import HfApi

# Config

In [108]:
# Seed passed to set_seed() for reproducibility.
seed = 42

# Language code of the subset to process. It drives both the raw file name and
# the Hugging Face dataset config so the two cannot drift apart.
lang = 'sun'

# Dataset split to process; used in the raw path and in the saved file name.
split = 'train'
raw_data_path = f'data/public_data_dev/track_a/{split}/{lang}.csv'
preprocessed_data_dir = './data/preprocessed_data_raw/'

# Target dataset repository on the Hugging Face Hub.
hf_data_id = 'alxxtexxr/SemEval2025-Task11-Dataset'
hf_data_config = lang
print("Hugging Face dataset config:", hf_data_config)

Hugging Face dataset config: sun


In [109]:
def set_seed(seed):
    """Seed the global random number generators used by this notebook.

    Seeds NumPy's global generator and Python's ``random`` module with the
    same value, stores the value in the ``PYTHONHASHSEED`` environment
    variable, and prints a confirmation line.

    Args:
        seed: Value handed to ``np.random.seed`` and ``random.seed``.

    Returns:
        None.
    """
    # NumPy first, then the stdlib generator - both receive the same value.
    for seeder in (np.random.seed, random.seed):
        seeder(seed)

    # Environment values must be strings.
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
    # presumably only influences child processes, not the running kernel - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Random seed set to: {seed}")

# Seed the generators once, up front, with the `seed` value from the config cell.
set_seed(seed)

Random seed set to: 42


# Data

## Preprocess Data

In [110]:
# Read the raw CSV located at `raw_data_path` and report how many rows it has.
df = pd.read_csv(raw_data_path)
print("Raw DF length:", len(df), end="\n\n")
df.head()

Raw DF length: 924



Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise
0,sun_train_track_a_00001,Kumaha mang fiksi engke ka sabilulungan mang s...,0,0,0,1,0,1
1,sun_train_track_a_00002,tapi domba anakan namah lain ku kuring mreun. ...,0,0,0,1,0,1
2,sun_train_track_a_00003,Aduh mang naha bet kudu penting di upload ma😂😂,0,0,0,1,0,0
3,sun_train_track_a_00004,"pokonamah nuhun sabandungeun , kita terus ting...",0,0,0,1,1,0
4,sun_train_track_a_00005,Kang eta teu isin?? Apa emng tukang ngisinkeun...,0,0,0,1,0,1


In [111]:
# Clean the `text` column.
#
# Every step works on a standalone Series (`text_clean`). The result is written
# back to `df` once, and the one-word filter runs last, after all cleaning.
text_clean = df['text']

# Remove timestamps such as "1:23" or "12: 05", except when they directly
# follow "menit" plus one whitespace character (negative lookbehind).
# Variant without the lookbehind, kept for reference:
# text_clean = text_clean.str.replace(r'\b\d{1,2}:\s?\d{2}\b', '', regex=True)
text_clean = text_clean.str.replace(r'(?<!menit\s)\b\d{1,2}:\s?\d{2}\b', '', regex=True)

# Drop the '#' of hashtags but keep the word itself.
# Variant that removed a whole leading hashtag, kept for reference:
# text_clean = text_clean.str.replace(r'^#\w+', '', regex=True)
text_clean = text_clean.str.replace(r'#(\w+)', r'\1', regex=True)

# Convert dialogue formats to use ':'
text_clean = text_clean.str.replace(r'<(\w+)>', r'\1:', regex=True)
# Same replacement for "X-", "X -", "X=" and "X =", applied in this order.
# NOTE(review): these match ANY uppercase letter before the separator, not only
# speaker labels (e.g. "OPEN-BO" becomes "OPEN:BO") - confirm this is intended.
for speaker_marker in (r'([A-Z])-', r'([A-Z]) -', r'([A-Z])=', r'([A-Z]) ='):
    text_clean = text_clean.str.replace(speaker_marker, r'\1:', regex=True)

# Replace dialogue names
text_clean = text_clean.str.replace(r'\b[Qq]:', 'Batur:', regex=True)
# In texts that now contain 'Batur:' (former Q:), 'A:' is the other speaker.
# The isinstance guard lets missing values (NaN) pass through untouched instead
# of raising, matching how the surrounding `.str` calls treat them.
text_clean = text_clean.apply(
    lambda x: x.replace('A:', 'Urang:') if isinstance(x, str) and 'Batur:' in x else x
)

# Remaining A:/B: dialogues (no Q: present): A -> Batur, B -> Urang.
text_clean = text_clean.str.replace(r'\b[Aa]:', 'Batur:', regex=True)
text_clean = text_clean.str.replace(r'\b[Bb]:', 'Urang:', regex=True)

# Remove patterns like -ic3rd4ow4d
# NOTE(review): this also deletes the second half of any hyphenated word
# ("a-b" becomes "a") - confirm that is acceptable for this language.
text_clean = text_clean.str.replace(r'-[a-zA-Z0-9]+', '', regex=True)

# Trim whitespace, drop leading punctuation marks (comma, dot, etc.), then trim
# again in case the punctuation was followed by spaces.
text_clean = text_clean.str.strip()
text_clean = text_clean.str.lstrip(',.!:$%^&*+')
text_clean = text_clean.str.strip()

df['text'] = text_clean

# Remove rows with only one word.
# This runs AFTER stripping: a text like "Mantap 12:30" turns into "Mantap "
# once the timestamp is removed, and the trailing space would otherwise make it
# count as multi-word. Rows whose text is empty or missing are dropped as well.
# `.copy()` makes the filtered frame independent, so the later
# `df['text'] = ...` assignment does not trigger SettingWithCopyWarning.
df = df[df['text'].str.count(r'\s+') > 0].copy()

df[120:130]

Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise
120,sun_train_track_a_00121,"Mantapppp mang, Kadieu amengan, Kamari willy p...",0,0,0,1,0,0
121,sun_train_track_a_00122,"Sae, lagu ieu di cover jadi aya sérédan, mantul",0,0,0,1,0,0
122,sun_train_track_a_00123,"Moal gagal neng azmy mah juara , Gaskeun",0,0,0,1,0,0
123,sun_train_track_a_00124,Ngeunah asli 3an ge wanieun gurihh,0,0,0,1,0,0
124,sun_train_track_a_00125,"Pernah teu ngalaman meuli barang nu geuleuh, ?...",0,1,0,0,1,0
125,sun_train_track_a_00126,Karek wantun nonton mang video na asa kacandak...,0,0,0,1,0,1
126,sun_train_track_a_00127,Kang pami beli kaos urang Sunda dimana?,0,0,0,1,0,0
127,sun_train_track_a_00128,Tuh mang Fiksi anu dislike NU rarumasa OPEN BO...,0,1,0,1,0,0
128,sun_train_track_a_00129,Enakeun adem banget sholawat thohirul qolbinyaa,0,0,0,1,0,0
129,sun_train_track_a_00130,Sok mang nyieun pidio ( e jeung eu ) tah nu he...,0,0,0,1,0,0


In [112]:
def contains_non_ascii(text):
    """Tell whether `text` holds at least one non-ASCII character.

    Args:
        text: String to inspect.

    Returns:
        True if any character falls outside the ASCII range, False otherwise
        (including for the empty string).
    """
    return not text.isascii()

# Flag each row whose text has any non-ASCII character, then report the total.
non_ascii_mask = df['text'].apply(contains_non_ascii)
print("Total data with non-ASCII chars:", int(non_ascii_mask.sum()))

Total data with non-ASCII chars: 375


In [113]:
# def normalize_to_ascii(text):
#     return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

def normalize_to_ascii_keep_emojis(text):
    """Strip accents from `text` while leaving emojis and other symbols intact.

    The text is NFKD-normalised, which splits accented letters into a base
    letter plus combining marks; every character with a non-zero combining
    class is then discarded. Characters that have no such decomposition
    (emojis, for instance) pass through unchanged, so the result is not
    guaranteed to be pure ASCII.

    Args:
        text: String to normalise.

    Returns:
        The normalised string without combining marks.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    kept_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return ''.join(kept_chars)

# Strip accents row by row. Emojis and other symbols are kept on purpose, so the
# count printed below is expected to stay above zero when such characters exist.
df['text'] = df['text'].apply(normalize_to_ascii_keep_emojis)
remaining_non_ascii = df['text'].apply(contains_non_ascii)
print("Total data with non-ASCII chars (after normalizing them):", int(remaining_non_ascii.sum()))

df.head()

Total data with non-ASCII chars (after normalizing them): 364


Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise
0,sun_train_track_a_00001,Kumaha mang fiksi engke ka sabilulungan mang s...,0,0,0,1,0,1
1,sun_train_track_a_00002,tapi domba anakan namah lain ku kuring mreun. ...,0,0,0,1,0,1
2,sun_train_track_a_00003,Aduh mang naha bet kudu penting di upload ma😂😂,0,0,0,1,0,0
3,sun_train_track_a_00004,"pokonamah nuhun sabandungeun , kita terus ting...",0,0,0,1,1,0
4,sun_train_track_a_00005,Kang eta teu isin?? Apa emng tukang ngisinkeun...,0,0,0,1,0,1


## Save Preprocessed Data

In [114]:
# Destination layout: <preprocessed_data_dir>/track_a/<hf_data_config>/<split>.csv
save_dir = os.path.join(preprocessed_data_dir, 'track_a', hf_data_config)

# Create the directory from Python rather than shelling out to `mkdir -p`:
# this works on every OS and is safe for paths that contain spaces.
# `exist_ok=True` keeps the cell re-runnable.
os.makedirs(save_dir, exist_ok=True)

# index=False: the DataFrame index is not written to the file.
df.to_csv(os.path.join(save_dir, f'{split}.csv'), index=False)

print("Saved to:", save_dir)

Saved to: ./data/preprocessed_data_raw/track_a/sun


## Upload Preprocessed Data to Hugging Face

In [105]:
# Upload everything under `save_dir` to the dataset repo `hf_data_id`.
hf_api = HfApi()
hf_api.upload_folder(
    repo_id=hf_data_id,
    repo_type='dataset',
    folder_path=save_dir,
    # Repo paths use '/' regardless of the local OS, so build this one with an
    # explicit separator; os.path.join would insert '\' on Windows.
    path_in_repo=f'preprocessed_data_raw/track_a/{hf_data_config}',
)

CommitInfo(commit_url='https://huggingface.co/datasets/alxxtexxr/SemEval2025-Task11-Dataset/commit/d8c2daa844f41687aca2c0dcbf02024094385bfa', commit_message='Upload folder using huggingface_hub', commit_description='', oid='d8c2daa844f41687aca2c0dcbf02024094385bfa', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/alxxtexxr/SemEval2025-Task11-Dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='alxxtexxr/SemEval2025-Task11-Dataset'), pr_revision=None, pr_num=None)