# Libraries

In [304]:
import json
import re
import pandas as pd
from openai import OpenAI
from sklearn.metrics import f1_score
from pprint import pprint

pd.set_option('max_colwidth', None)

# Config

In [295]:
seed = 42

client = OpenAI(api_key=json.load(open('secrets.json'))['OPENAPI_SECRET_KEY'])

idn2eng_emotion_map = {
    'marah': 'anger',
    'jijik': 'disgust',
    'takut': 'fear',
    'senang': 'joy',
    'sedih': 'sadness',
    'terkejut': 'surprise',
    'biasa': 'neutral',
}

# Data

## Load Data

In [333]:
df = pd.read_csv('data/preprocessed_data/track_a/sun_70_15_15_stratify_v2/val.csv')
df = df.rename(columns=idn2eng_emotion_map) # Translate emotion columns from Indonesian to English

print("DF size:", len(df))
df.head()

DF size: 128


Unnamed: 0,text,emotion,anger,disgust,fear,joy,sadness,surprise,neutral
0,alhamdulillah hatur nuhun otw nonton ieu mah,senang,0,0,0,1,0,0,0
1,"Nongton iklan nepi ka tamat ,😭",senang,0,0,0,1,0,0,0
2,min ai mnh teu cape lulumpatan wae :v kumaha weh ngagawe keun jelma mah ceuk mang dana ge :v,"senang, terkejut",0,0,0,1,0,1,0
3,"Hebaaat , mang mung basa sundana rada rapihken ulah babeulit, nya, nya","senang, terkejut",0,0,0,1,0,1,0
4,Aya meri dina rakit Boboko wadah bakatul Lain nyeri ku panyakit Kabogoh direbut batur🤣🤣😭,"senang, sedih, terkejut",0,0,0,1,1,1,0


In [342]:
# Remove data with 'neutral' emotion and the 'neutral' column
if 'neutral' in df.columns:
    df = df[~(df['neutral'] == 1)]  
    df = df.drop('neutral', axis=1) 

# Store emotion columns
emotion_cols = [col for col in df.columns if col not in ['Unnamed: 0', 'text', 'emotion', 'id']]
df['emotion'] = df.apply(lambda row: ', '.join([emotion for emotion in emotion_cols if row[emotion] == 1]), axis=1)

print("DF size (after removing emotion 'neutral'):", len(df))
print("Emotion columns:", emotion_cols)
df.head()

DF size (after removing emotion 'neutral'): 121
Emotion columns: ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']


Unnamed: 0,text,emotion,anger,disgust,fear,joy,sadness,surprise
0,alhamdulillah hatur nuhun otw nonton ieu mah,joy,0,0,0,1,0,0
1,"Nongton iklan nepi ka tamat ,😭",joy,0,0,0,1,0,0
2,min ai mnh teu cape lulumpatan wae :v kumaha weh ngagawe keun jelma mah ceuk mang dana ge :v,"joy, surprise",0,0,0,1,0,1
3,"Hebaaat , mang mung basa sundana rada rapihken ulah babeulit, nya, nya","joy, surprise",0,0,0,1,0,1
4,Aya meri dina rakit Boboko wadah bakatul Lain nyeri ku panyakit Kabogoh direbut batur🤣🤣😭,"joy, sadness, surprise",0,0,0,1,1,1


# Evaluation

In [307]:
prompt = """You will be provided with a text in Sundanese. Your task is to identify the emotion(s) expressed in the text. This is a multi-label classification task, meaning the text may convey one or more emotions simultaneously. You must always provide a result; there should never be an instance with no identified emotion(s).

## PREDEFINED EMOTION LABELS
```
- anger
- fear
- joy
- sadness
- surprise
```

## GUIDELINES
1. Laughter-like expressions, such as "hahaha," "wkwkw," or similar, typically indicate `joy`.
2. Smile or laugh emojis, such as 😊, ☺️, 😃, 😁, 😆, 😇, 🥰, 😍, 😋, 🤗, 😌, typically indicate `joy`.
3. Insults typically indicate both `anger` and `disgust`.
4. The emoji 😭 does not always indicate `sadness`. In humorous contexts, it can represent extreme laughter that brings tears, indicating `joy`.
5. In Sundanese, "NU" often means "yang" in Indonesian, not "Nahdlatul Ulama". Always interpret "NU" based on the context.

## STEP-BY-STEP INSTRUCTIONS
1. Identify all keyword(s) in the text that carry significant emotion-related information. For each keyword, conduct an in-depth analysis to determine one or more emotions it conveys. Express your analysis as a natural, conversational internal monologue. Follow these principles:

a. Translate the Keyword
- Translate the identified keyword from Sundanese to English. 
- Understand its meaning in both languages.

b. Refer to the Guidelines First
- Always refer to the provided guidelines first to determine the emotion(s) associated with the keyword.
- If any guideline applies, follow it strictly and conclude your analysis.
- If no guideline applies, proceed to construct your reasoning.

c. Construct Reasoning
- Adopt a mindset of self-doubt and curiosity. Ask numerous questions and explore different possibilities.
- Avoid rushing to conclusions; instead, iterate and refine your thoughts through multiple rounds of questioning.
- Build your understanding progressively by answering questions step by step. If uncertainties remain, continue exploring, reasoning, and revising until a comprehensive understanding emerges.

2. Based on the identified keyword(s), determine the overall emotion(s) expressed in the text.

## CONVERSATIONAL INNER MONOLOGUE STYLE
1. Following the Guidelines
- "Based on the guidelines"
- "According to the guidelines"

2. Natural Thought Flow
- "Hmm... let me think about this"
- "Going back to what I thought earlier"

3. Progressive Building
- "Let me break this down further"
- "Building on that last point"
- "This connects to what I noticed earlier"

## OUTPUT FORMAT
<keywords>

For each keyword, return in the format: 
<keyword>
- Keyword: The identified keyword.
- Analysis: Your in-depth analysis (minimum 1000 characters).
- Emotions: A List of one or more specified emotions separated by commas. Include the potential emotion too.
</keyword>

</keywords>

<emotions>
A list of one or more emotion(s) separated by commas.
</emotions>"""

print(prompt)

You will be provided with a text in Sundanese. Your task is to identify the emotion(s) expressed in the text. This is a multi-label classification task, meaning the text may convey one or more emotions simultaneously. You must always provide a result; there should never be an instance with no identified emotion(s).

## PREDEFINED EMOTION LABELS
```
- anger
- fear
- joy
- sadness
- surprise
```

## GUIDELINES
1. Laughter-like expressions, such as "hahaha," "wkwkw," or similar, typically indicate `joy`.
2. Smile or laugh emojis, such as 😊, ☺️, 😃, 😁, 😆, 😇, 🥰, 😍, 😋, 🤗, 😌, typically indicate `joy`.
3. Insults typically indicate both `anger` and `disgust`.
4. The emoji 😭 does not always indicate `sadness`. In humorous contexts, it can represent extreme laughter that brings tears, indicating `joy`.
5. In Sundanese, "NU" often means "yang" in Indonesian, not "Nahdlatul Ulama". Always interpret "NU" based on the context.

## STEP-BY-STEP INSTRUCTIONS
1. Identify all keyword(s) in the text that 

In [343]:
def one_hot_encode_emotions(emotions, emotion_cols=emotion_cols):
    one_hot_emotion = [1 if emotion_col in emotions else 0 for emotion_col in emotion_cols]
    return one_hot_emotion

def get_pred_tag_content(prediction, tag_name):
    match = re.search(fr'<{tag_name}>(.*?)</{tag_name}>', prediction, re.DOTALL)
    tag_content = match.group(1).strip() if match else None
    return tag_content

def predict(text, prompt=prompt):
    completion = client.chat.completions.create(
        model='gpt-4o',
        messages=[
            {'role': 'system', 'content': prompt},
            {'role': 'user', 'content': text},
        ],
        max_tokens=1000,
    )
    return completion.choices[0].message.content

prediction = predict("Cek urang mah jelema goblog teh nyaeta, jelema nu menta udud ngomong teu boga, padahal Aya sabungkus dinu pesak 😂🤣")

keywords_pred = get_pred_tag_content(prediction, 'keywords')
print("Precicted keywords and their analyses:")
print(keywords_pred)
print()

emotions_pred = get_pred_tag_content(prediction, 'emotions').replace(" ", "").split(",")
print("Precicted emotions:", emotions_pred, "->", one_hot_encode_emotions(emotions_pred))

Precicted keywords and their analyses:
<keyword>
- Keyword: goblog
- Analysis: The word "goblog" translates to "stupid" or "foolish" in English. In the context of the sentence, it is used to insult someone by calling them a fool. According to the guidelines, insults typically indicate both `anger` and `disgust`. This is because calling someone foolish generally carries a negative connotation and suggests frustration or contempt towards the person being referred to. There isn't any guideline that directly associates the word "goblog" with any other emotion like `joy`, `fear`, `sadness`, or `surprise`, which reinforces the association to `anger` and `disgust` primarily. The usage of "goblog" here seems intentional to express a critical opinion about someone's behavior, underlying a sense of impatience or annoyance as well, which aligns with `anger`.
- Emotions: anger, disgust
</keyword>

<keyword>
- Keyword: 😂🤣
- Analysis: The emojis 😂 (face with tears of joy) and 🤣 (rolling on the floor

In [349]:
def eval_sampled(n):
    df_sampled = df.sample(n=n)
    y_true = df_sampled[emotion_cols].to_numpy()

    y_pred = []
    for i, v in enumerate(df_sampled.iterrows()):
        idx, row = v

        print("=" * 64)
        print(f"{i+1} of {len(df_sampled)}")
        print("=" * 64)

        is_valid = True

        while True:
            print("Predicting...")        

            prediction = predict(row['text'])
            keywords_pred = get_pred_tag_content(prediction, 'keywords')
            emotions_pred = get_pred_tag_content(prediction, 'emotions').replace(" ", "").split(",")
            emotions_pred_one_hot = one_hot_encode_emotions(emotions_pred, emotion_cols)

            is_valid = bool(sum(emotions_pred_one_hot))

            if is_valid:
                break
            else:
                print("Error! Prediction:", prediction)
                print()
            
        y_pred.append(emotions_pred_one_hot)

        print()
        print("# TEXT")
        print(row['text'])
        print()
        print("# KEYWORDS")
        print(keywords_pred)
        print()
        print("# PREDICTED EMOTIONS")
        print(emotions_pred)
        print()
        print("# TRUE EMOTIONS")
        print(row['emotion'])
        print()

    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=1.0)
    print("F1 Macro:", f1_macro)

eval_sampled(n=20)

1 of 20
Predicting...

# TEXT
Mang pas mang dana nyebutkeun "tah urang inget" langsung aya iklan gojek mang, saya ge jadi kainget naha teu make gofood, 😁

# KEYWORDS
<keyword>
- Keyword: 😁
- Analysis: Ah, the smile emoji! Now, this is something that usually indicates an emotion like joy or happiness. Let me think about the context here. The person is recalling an advertisement for Gojek right after someone mentioned "tah urang inget." This likely triggers a light-hearted or humorous memory. The presence of the smile emoji reinforces that this situation is seen in a positive or humorous light, rather than anything negative. The smile emoji is a strong indicator, according to the guidelines, of joy. So, even if the rest of the context might suggest something else, unless it's overwhelmingly negative or sad, the smile typically wins out for joy here. I can deduce that the person seems to find something amusing or enjoyable about the situation, perhaps the coincidence of remembering Gojek’

## Save Evaluation

In [None]:
# save_df = df.copy()
# save_df[emotion_cols] = y_pred
# save_df.to_csv('eval_sun.csv', index=False)
# print("Saved to: eval_sun.csv")