# Libraries

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from huggingface_hub import HfApi

# Config

In [2]:
# Random seed passed to the stochastic steps of this notebook (the train/validation split).
seed = 42

# Raw CSVs are read from `data_dir`; the processed splits are written beneath it.
data_dir = './data/sundanese_twitter_dataset'
preprocessed_data_dir = f'{data_dir}/processed_data'

# Data

## Download Data

In [3]:
# One-off download of the four raw CSVs into `data_dir`; uncomment and run when the files are not on disk yet.
# NOTE(review): presumably left commented out so a re-run does not fetch the files again — confirm.
# !wget -P $data_dir https://raw.githubusercontent.com/virgantara/sundanese-twitter-dataset/refs/heads/master/sundanese.csv
# !wget -P $data_dir https://raw.githubusercontent.com/virgantara/sundanese-twitter-dataset/refs/heads/master/newdataset.csv
# !wget -P $data_dir https://raw.githubusercontent.com/virgantara/sundanese-twitter-dataset/refs/heads/master/newone.csv
# !wget -P $data_dir https://raw.githubusercontent.com/virgantara/sundanese-twitter-dataset/refs/heads/master/testdataset.csv

## Load Data

In [9]:
# Only `newone.csv` (source of the train/validation pool) and `testdataset.csv`
# (held-out test set) are loaded by this notebook.
newone_df = pd.read_csv(os.path.join(data_dir, 'newone.csv'))
testdataset_df = pd.read_csv(os.path.join(data_dir, 'testdataset.csv'))

loaded_frames = {
    'newone_df': newone_df,
    'testdataset_df': testdataset_df,
}

# Report size, columns and label vocabulary of each raw frame, one group at a time.
for frame_name, frame in loaded_frames.items():
    print(f"Length of {frame_name}:", len(frame))
print()
for frame_name, frame in loaded_frames.items():
    print(f"{frame_name} Columns:", list(frame.columns))
print()

for frame_name, frame in loaded_frames.items():
    print(f"{frame_name} labels:", frame['label'].unique())

Length of newone_df: 2518
Length of testdataset_df: 12

newone_df Columns: ['label', 'data']
testdataset_df Columns: ['label', 'data']

newone_df labels: [2 1 4 3]
testdataset_df labels: ['marah' 'takut' 'sedih' 'senang']


## Create Label Data

In [11]:
# Integer id assigned to each emotion name.
emotion2label = {
    'marah': 0,
    'sedih': 1,
    'senang': 2,
    'takut': 3,
}

# `newone.csv` stores its labels as integer codes; translate them to emotion
# names first, then to ids through `emotion2label`.
# NOTE(review): the code -> emotion assignment below is taken as given; confirm
# it against the dataset's own documentation.
raw_code2emotion = {
    1: 'marah',
    2: 'sedih',
    3: 'senang',
    4: 'takut',
}

train_val_df = newone_df.copy()
train_val_df['emotion'] = train_val_df['label'].map(raw_code2emotion)

# `Series.map` turns values missing from the dict into NaN without complaint.
# Fail loudly so an unexpected raw code cannot become a NaN label downstream.
unmapped_codes = train_val_df.loc[train_val_df['emotion'].isna(), 'label'].unique()
assert len(unmapped_codes) == 0, f"Unmapped raw label codes: {list(unmapped_codes)}"

train_val_df['label'] = train_val_df['emotion'].map(emotion2label)
train_val_df.head()

Unnamed: 0,label,data,emotion
0,1,...nepi ayeuna mun inget indung kadang sok cir...,sedih
1,0,"""Hidup aing, kumaha aing"" Kalo hidup maneh nyi...",marah
2,3,"""Insidious1 aja, udah bikin muringkak. Komo In...",takut
3,2,"""Kami fokeus di AFC..hahahahaha.."" ngeunah seu...",senang
4,3,"""Mah potoin iin ih sama jurig itu"" bari jeung ...",takut


In [12]:
# The test CSV carries emotion names in `label`: move them to `emotion` and
# store the integer id from `emotion2label` in `label`.
test_df = testdataset_df.rename(columns={'label': 'emotion'})
test_df['label'] = test_df['emotion'].map(emotion2label)

# `Series.map` yields NaN for names missing from `emotion2label` (which would
# also turn the column into floats); stop here instead of saving bad labels.
unknown_emotions = test_df.loc[test_df['label'].isna(), 'emotion'].unique()
assert len(unknown_emotions) == 0, f"Unmapped test emotions: {list(unknown_emotions)}"

test_df.head()

Unnamed: 0,emotion,data,label
0,marah,"Kade kabentar ah, lagi males marah2. Modar we ...",0
1,marah,jadi lieur anjing hayang modar,0
2,marah,"Mamanas wae sia mah anjing, gera modar sia",0
3,takut,Punten ini akurat ga ya sieun ihh daerah aku m...,3
4,takut,"Teu hayang, sieun geus gede dikerekeb.... moda...",3


## Stratified Split Data

In [15]:
# The validation split gets as many rows as the test set; the remaining rows
# of `train_val_df` are used for training.
test_size = len(test_df)
val_size = test_size
train_size = len(train_val_df) - val_size

for split_name, split_len in (
    ("Testing", test_size),
    ("Validation", val_size),
    ("Training", train_size),
):
    print(f"{split_name} split size:", split_len)

Testing split size: 12
Validation split size: 12
Training split size: 2506


In [16]:
# Split `train_val_df` into train/validation, stratified on the integer label;
# `random_state=seed` pins the shuffle.
split_parts = train_test_split(
    train_val_df,
    train_size=train_size,
    stratify=train_val_df['label'],
    random_state=seed,
)
train_df, val_df = split_parts

for split_name, split_frame in (
    ("train_df", train_df),
    ("val_df", val_df),
    ("test_df", test_df),
):
    print(f"Length of {split_name}:", len(split_frame))

Length of train_df: 2506
Length of val_df: 12
Length of test_df: 12


## Save Preprocessed Data

In [17]:
# Hub dataset repo id, plus a config name made of the train/val/test sizes.
hf_data_id = 'alxxtexxr/sundanese-twitter-dataset'
hf_data_config = '_'.join(str(size) for size in (train_size, val_size, test_size))

print(f"Hugging Face data ID: {hf_data_id}")
print(f"Hugging Face data config: {hf_data_config}")

Hugging Face data ID: alxxtexxr/sundanese-twitter-dataset
Hugging Face data config: 2506_12_12


In [18]:
save_dir = os.path.join(preprocessed_data_dir, hf_data_config)

# Create the output directory from Python instead of a `!mkdir -p` shell magic:
# this works without a POSIX shell and is safe for paths containing spaces.
os.makedirs(save_dir, exist_ok=True)

# One CSV per split; `index=False` keeps the DataFrame index out of the files.
for split_name, split_frame in (
    ('train', train_df),
    ('val', val_df),
    ('test', test_df),
):
    split_frame.to_csv(os.path.join(save_dir, f'{split_name}.csv'), index=False)

print("Saved to:", save_dir)

Saved to: ./data/sundanese_twitter_dataset/processed_data/2506_12_12


## Upload Preprocessed Data to Hugging Face

In [19]:
# Upload the saved split folder to the dataset repo.
hf_api = HfApi()
hf_api.upload_folder(
    repo_id=hf_data_id,
    repo_type='dataset',
    folder_path=save_dir,
    # `path_in_repo` is a path inside the repo, not on the local filesystem, so
    # it is built with '/' explicitly; `os.path.join` would insert backslashes
    # on Windows.
    # NOTE(review): the repo folder is 'preprocessed_data' while the local
    # folder is 'processed_data' — kept as-is to match the existing upload.
    path_in_repo=f'preprocessed_data/{hf_data_config}',
)

CommitInfo(commit_url='https://huggingface.co/datasets/alxxtexxr/sundanese-twitter-dataset/commit/790d99d62e058768c35afa532aec95fb6a67f70a', commit_message='Upload folder using huggingface_hub', commit_description='', oid='790d99d62e058768c35afa532aec95fb6a67f70a', pr_url=None, pr_revision=None, pr_num=None)