# Libraries

In [4]:
import os
import random
import numpy as np
import pandas as pd
import torch
import transformers
from datasets import load_dataset

# Config

In [19]:
# Global random seed; passed to set_seed() so every run starts from the same RNG state.
seed = 42

# Identifier and configuration name handed to `load_dataset` further down.
# NOTE(review): the config name presumably encodes the train/val/test split sizes — confirm.
hf_data_id = "alxxtexxr/sundanese-twitter-dataset"
hf_data_config = "2506_12_12"

In [6]:
def set_seed(seed):
    """Seed every random number generator used by this notebook.

    Applies the same seed to Python's ``random`` module, NumPy, PyTorch
    (CPU and CUDA) and Transformers, configures cuDNN for deterministic
    behaviour, exports ``PYTHONHASHSEED`` and prints a confirmation line.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Export the hash seed as a string (environment values must be str).
    # NOTE: CPython reads PYTHONHASHSEED at interpreter start-up, so this
    # assignment is only picked up by child processes, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python's built-in generator and NumPy's legacy global generator
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch: default generator, current CUDA device, then all CUDA devices
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # cuDNN: request deterministic kernels and switch off the benchmark autotuner
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # Hugging Face Transformers' own seeding helper
    transformers.set_seed(seed)

    print(f"Random seed set to: {seed}")

set_seed(seed)

Random seed set to: 42


# Data

## Load Data

In [20]:
# Load the configured dataset; the result is indexed by split name below.
datasets = load_dataset(path=hf_data_id, name=hf_data_config)

# Split names as returned by the loader, and the column names of the train split.
splits = list(datasets.keys())
cols = [*datasets['train'].features]

print("Splits:", splits)
print("Data columns:", cols)

Splits: ['train', 'val', 'test']
Data columns: ['label', 'data', 'emotion']


In [21]:
# One pandas DataFrame per split, keyed by split name
# (`splits` was built from `datasets.keys()`, so the keys and order are the same).
df = {name: pd.DataFrame(part) for name, part in datasets.items()}

# Stack all splits into a single frame; each split keeps its own row index.
df_all = pd.concat([df[name] for name in splits])
df_all.head()

Unnamed: 0,label,data,emotion
0,2,"Aih, hatur nuhun atuh nya Meuni nyaah pisan ur...",senang
1,2,Ikut bangga ya allah liat ales dinotic. Alhamd...,senang
2,0,"tong melak cangkeng wae maneh teh, teu sopan",marah
3,3,Jalan nu bade ka pangandaran ge nu pas bulak-b...,takut
4,1,"Sedih pisan eta asli lah, kami geus nonton sam...",sedih


## EDA

In [16]:
# `df_all` concatenates every split (train, val and test), so this count is the
# size of the whole dataset rather than of the training split alone.
print("Total dataset size (all splits):", len(df_all))

Training dataset size: 2530


In [18]:
# Class balance check: number of rows per emotion in the training split only.
print("Emotions distribution:")
df['train'].loc[:, 'emotion'].value_counts()

Emotions distribution:


emotion
senang    634
marah     625
sedih     624
takut     623
Name: count, dtype: int64