# Libraries

In [102]:
import pandas as pd

# Config

In [103]:
# Input CSV processed by this notebook.
data_path = 'data/go_emotions_sun/neutral_single_50_comb_0_translated_cleaned_curation.csv'

# Emotion column name in the dataset -> English name (used to build the output filename).
idn2eng_emotion_map = dict(
    marah='anger',
    jijik='disgust',
    takut='fear',
    senang='joy',
    sedih='sadness',
    terkejut='surprise',
    biasa='neutral',
)

# Emotion flag columns, in the same order as the map above.
valid_emotions = list(idn2eng_emotion_map)

# Emotion this run is cleaning for; must be one of valid_emotions.
selected_emotion = 'biasa'

# Optional toggle: uncomment to make the cleaning step discard rows that carry
# any of these emotions. The cleaning cell only checks whether this name exists.
# prev_selected_emotions = ['senang']

# Data

## Load Data

In [104]:
# Read the CSV named in the config cell, report its row count, and preview it.
df = pd.read_csv(data_path)
print(f"DF size: {len(df)}")
df.head()

DF size: 50


Unnamed: 0,id,curation_status,text,text_translated,emotion,marah,jijik,takut,senang,sedih,terkejut,biasa
0,ef4fl50,1,reddit was having server issues,reddit keur boga masalah server,biasa,0,0,0,0,0,0,1
1,ee31gyc,1,I just don't see it.,Abdi teu ningali éta.,biasa,0,0,0,0,0,0,1
2,eeyxd7d,1,"That thing is rather thin , it wouldn't be a b...","Eta barang rada ipis, teu bakal jadi masalah g...",biasa,0,0,0,0,0,0,1
3,eczusn7,1,its north park,eta taman kalér,biasa,0,0,0,0,0,0,1
4,edqjl3r,1,I have mixed feelings about discovering aliens...,Abdi gaduh perasaan campur aduk ngeunaan menda...,takut,0,0,1,0,0,0,0


## Clean Data

In [105]:
# Keep only rows whose curation_status equals 1. When the column is absent the
# frame is passed through unchanged (and nothing is printed).
# NOTE(review): presumably 1 marks rows accepted during curation — confirm.
if 'curation_status' not in df.columns:
    df_cleaned = df
else:
    is_curated = df['curation_status'].eq(1)
    df_cleaned = df[is_curated]
    print(f"Cleaned DF size (after curation): {len(df_cleaned)}")

Cleaned DF size (after curation): 11


In [106]:
# Restrict the cleaned rows to ids that also appear in the population file.
population_df = pd.read_csv('data/go_emotions_sun/filtered.csv')

in_population = df_cleaned['id'].isin(population_df['id'])
df_cleaned = df_cleaned[in_population]
print(f"Cleaned DF size (after matching with the population): {len(df_cleaned)}")

Cleaned DF size (after matching with the population): 11


In [107]:
# Per-emotion row totals before any exclusion.
emotion_totals = df_cleaned[valid_emotions].sum()
print(emotion_totals)
print()

# Only active when the config cell defines prev_selected_emotions: keep rows
# where every one of those emotion flags is 0, then show the totals again.
if 'prev_selected_emotions' in globals():
    none_prev_selected = df_cleaned[prev_selected_emotions].eq(0).all(axis=1)  # type: ignore
    df_cleaned = df_cleaned[none_prev_selected]
    print(df_cleaned[valid_emotions].sum())

marah        0
jijik        0
takut        1
senang       0
sedih        0
terkejut     0
biasa       10
dtype: int64



In [108]:
# Remove rows whose 'text' repeats an earlier row, keeping the first occurrence,
# and report the duplicate count before and after.
dup_count_before = df_cleaned['text'].duplicated().sum()
print(f"Total duplicates: {dup_count_before}")

is_repeat = df_cleaned['text'].duplicated(keep='first')
df_cleaned = df_cleaned[~is_repeat]

dup_count_after = df_cleaned['text'].duplicated().sum()
print(f"Total duplicates (after cleaning duplicates): {dup_count_after}")
print()
print(f"Cleaned DF size (after cleaning duplicates): {len(df_cleaned)}")

Total duplicates: 0
Total duplicates (after cleaning duplicates): 0

Cleaned DF size (after cleaning duplicates): 11


In [109]:
# Drop rows whose translated text contains "r/", "/u/", or (case-insensitively)
# "maaf".
#
# The needles are plain substrings, so they are matched literally
# (regex=False). na=False makes a missing translation count as "no match",
# which keeps every mask strictly boolean; without it a NaN/NA translation
# leaks into the mask and the result of `|`, `~` and boolean indexing depends
# on the column dtype and pandas version.
#
# NOTE(review): presumably "r/" and "/u/" target subreddit/user mentions and
# "maaf" targets apology-style translator output — confirm. "r/" also matches
# any word ending in "r" directly followed by a slash (e.g. "server/client").
translated = df_cleaned['text_translated']

cond_r = translated.str.contains("r/", regex=False, na=False)
cond_u = translated.str.contains("/u/", regex=False, na=False)
cond_maaf = translated.str.lower().str.contains("maaf", regex=False, na=False)
# Disabled filters, kept for reference:
# cond_terjemah = df_cleaned['text_translated'].str.contains('terjemah')
# cond_double = df_cleaned['text_translated'].str.contains(r'(.)\1{2,}')

df_cleaned = df_cleaned[~(cond_r | cond_u | cond_maaf)]
print("Cleaned DF size (after cleaning with specific conditions):", len(df_cleaned))
df_cleaned.head()

Cleaned DF size (after cleaning with specific conditions): 11


Unnamed: 0,id,curation_status,text,text_translated,emotion,marah,jijik,takut,senang,sedih,terkejut,biasa
0,ef4fl50,1,reddit was having server issues,reddit keur boga masalah server,biasa,0,0,0,0,0,0,1
1,ee31gyc,1,I just don't see it.,Abdi teu ningali éta.,biasa,0,0,0,0,0,0,1
2,eeyxd7d,1,"That thing is rather thin , it wouldn't be a b...","Eta barang rada ipis, teu bakal jadi masalah g...",biasa,0,0,0,0,0,0,1
3,eczusn7,1,its north park,eta taman kalér,biasa,0,0,0,0,0,0,1
4,edqjl3r,1,I have mixed feelings about discovering aliens...,Abdi gaduh perasaan campur aduk ngeunaan menda...,takut,0,0,1,0,0,0,0


In [110]:
# Count how many emotion flags are set on each row.
#
# A vectorised row-sum replaces the original per-row `apply`, and `assign()`
# binds a fresh frame instead of writing a column into a filtered slice with
# `.loc[:, ...] =` (which can trigger SettingWithCopyWarning). The row-sum is
# also well defined for an empty frame, and re-running the cell simply
# recomputes the column.
emotion_counts = df_cleaned[valid_emotions].sum(axis=1).astype(int)
df_cleaned = df_cleaned.assign(num_emotions=emotion_counts)

# Rows carrying the selected emotion, split by whether it is the only emotion
# on the row ("single") or appears together with others ("comb").
has_selected = df_cleaned[selected_emotion] == 1
df_cleaned_single = df_cleaned[has_selected & (df_cleaned['num_emotions'] == 1)]
df_cleaned_comb = df_cleaned[has_selected & (df_cleaned['num_emotions'] > 1)]

print("Total cleaned data with single emotion:", len(df_cleaned_single))
print("Total cleaned data with combined emotions:", len(df_cleaned_comb))

Total cleaned data with single emotion: 10
Total cleaned data with combined emotions: 0


## Save Data

In [111]:
# Remove the helper column (if it was added) so it is not written to disk.
df_cleaned = df_cleaned.drop(columns=['num_emotions'], errors='ignore')

# Output name: <english emotion>_single_<n>_comb_<m>_translated_cleaned[_curated].csv
# The "_curated" suffix is added only when the loaded frame had a
# curation_status column.
# NOTE(review): the counts in the name come from df_cleaned_single /
# df_cleaned_comb, while the rows written are all of df_cleaned, which may
# include rows that do not carry the selected emotion — confirm this is intended.
emotion_eng = idn2eng_emotion_map[selected_emotion]
curated_suffix = "_curated" if "curation_status" in df.columns else ""
save_path = (
    f'data/go_emotions_sun/{emotion_eng}'
    f'_single_{len(df_cleaned_single)}'
    f'_comb_{len(df_cleaned_comb)}'
    f'_translated_cleaned{curated_suffix}.csv'
)

df_cleaned.to_csv(save_path, index=False)
print("Saved to:", save_path)

Saved to: data/go_emotions_sun/neutral_single_10_comb_0_translated_cleaned_curated.csv
