# Libraries

In [17]:
import json
import re
import pandas as pd
from openai import OpenAI
from sklearn.metrics import f1_score
# from pprint import pprint

pd.set_option('max_colwidth', None)

# Config

In [18]:
seed = 42
client = OpenAI(api_key=json.load(open('secrets.json'))['OPENAPI_SECRET_KEY'])
idn2eng_emotion_map = {
    'marah': 'anger',
    'jijik': 'disgust',
    'takut': 'fear',
    'senang': 'joy',
    'sedih': 'sadness',
    'terkejut': 'surprise',
    'biasa': 'neutral',
}
prompt_paths = [
    'prompts/prompt_with_keywords_and_analysis.txt',
    'prompts/prompts_with_keywords.txt',
]

# Data

## Load Data

In [19]:
df = pd.read_csv('data/preprocessed_data/track_a/sun_70_15_15_stratify_v2/val.csv')
df = df.rename(columns=idn2eng_emotion_map) # Translate emotion columns from Indonesian to English

print("DF size:", len(df))
df.head()

DF size: 128


Unnamed: 0,text,emotion,anger,disgust,fear,joy,sadness,surprise,neutral
0,alhamdulillah hatur nuhun otw nonton ieu mah,senang,0,0,0,1,0,0,0
1,"Nongton iklan nepi ka tamat ,😭",senang,0,0,0,1,0,0,0
2,min ai mnh teu cape lulumpatan wae :v kumaha weh ngagawe keun jelma mah ceuk mang dana ge :v,"senang, terkejut",0,0,0,1,0,1,0
3,"Hebaaat , mang mung basa sundana rada rapihken ulah babeulit, nya, nya","senang, terkejut",0,0,0,1,0,1,0
4,Aya meri dina rakit Boboko wadah bakatul Lain nyeri ku panyakit Kabogoh direbut batur🤣🤣😭,"senang, sedih, terkejut",0,0,0,1,1,1,0


In [20]:
# Remove data with 'neutral' emotion and the 'neutral' column
if 'neutral' in df.columns:
    df = df[~(df['neutral'] == 1)]  
    df = df.drop('neutral', axis=1) 

# Store emotion columns
emotion_cols = [col for col in df.columns if col not in ['Unnamed: 0', 'text', 'emotion', 'id']]
df['emotion'] = df.apply(lambda row: ', '.join([emotion for emotion in emotion_cols if row[emotion] == 1]), axis=1)

print("DF size (after removing emotion 'neutral'):", len(df))
print("Emotion columns:", emotion_cols)
df.head()

DF size (after removing emotion 'neutral'): 121
Emotion columns: ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']


Unnamed: 0,text,emotion,anger,disgust,fear,joy,sadness,surprise
0,alhamdulillah hatur nuhun otw nonton ieu mah,joy,0,0,0,1,0,0
1,"Nongton iklan nepi ka tamat ,😭",joy,0,0,0,1,0,0
2,min ai mnh teu cape lulumpatan wae :v kumaha weh ngagawe keun jelma mah ceuk mang dana ge :v,"joy, surprise",0,0,0,1,0,1
3,"Hebaaat , mang mung basa sundana rada rapihken ulah babeulit, nya, nya","joy, surprise",0,0,0,1,0,1
4,Aya meri dina rakit Boboko wadah bakatul Lain nyeri ku panyakit Kabogoh direbut batur🤣🤣😭,"joy, sadness, surprise",0,0,0,1,1,1


# Evaluation

In [31]:
def get_prompt(prompt_path):
    prompt = None
    with open(prompt_path, 'r') as f:
        prompt = f.read()
    return prompt

def one_hot_encode_emotions(emotions, emotion_cols=emotion_cols):
    one_hot_emotion = [1 if emotion_col in emotions else 0 for emotion_col in emotion_cols]
    return one_hot_emotion

def get_pred_tag_content(prediction, tag_name):
    match = re.search(fr'<{tag_name}>(.*?)</{tag_name}>', prediction, re.DOTALL)
    tag_content = match.group(1).strip() if match else None
    return tag_content

def predict(text, prompt):
    completion = client.chat.completions.create(
        model='gpt-4o',
        messages=[
            {'role': 'system', 'content': prompt},
            {'role': 'user', 'content': text},
        ],
        max_tokens=1000,
    )
    return completion.choices[0].message.content

prediction = predict(
    text="Cek urang mah jelema goblog teh nyaeta, jelema nu menta udud ngomong teu boga, padahal Aya sabungkus dinu pesak 😂🤣",
    prompt=get_prompt(prompt_paths[0]),
)

keywords_pred = get_pred_tag_content(prediction, 'keywords')
print("Precicted keywords and their analyses:")
print(keywords_pred)
print()

emotions_pred = get_pred_tag_content(prediction, 'emotions').lower().replace("." , "").replace(" ", "").split(",")
print("Precicted emotions:", emotions_pred, "->", one_hot_encode_emotions(emotions_pred))

Precicted keywords and their analyses:
<keyword>
- Keyword: goblog
- Analysis: The word "goblog" in Sundanese translates to "stupid" or "idiot" in English. This term is typically used as an insult, and according to the guidelines, insults are generally associated with both `anger` and `disgust`. Using such a term suggests that the speaker is expressing a strong negative emotion towards the person they are referring to, possibly because they believe the person's actions do not make sense. It is an expression that reflects a frustration or disdain for perceived foolishness or incompetence.
- Emotions: anger, disgust
</keyword>

<keyword>
- Keyword: 😂🤣
- Analysis: The emojis 😂 and 🤣 are both representations of laughter and are typically used to indicate something is extremely funny or amusing. Referring to the guidelines, laughter-like expressions, especially emojis of this nature, usually indicate `joy`. In the context of the sentence, the use of these emojis suggests that the speaker fi

In [39]:
def eval_sampled(n, prompt):
    df_sampled = df.sample(n=n)
    y_true = df_sampled[emotion_cols].to_numpy()

    y_pred = []
    for i, v in enumerate(df_sampled.iterrows()):
        idx, row = v

        print("=" * 64)
        print(f"{i+1} OF {len(df_sampled)}")
        print("=" * 64)

        is_valid = True

        while True:
            print("Predicting...")        

            prediction = predict(row['text'], prompt)
            keywords_pred = get_pred_tag_content(prediction, 'keywords')
            emotions_pred = get_pred_tag_content(prediction, 'emotions').replace(" ", "").split(",")
            emotions_pred_one_hot = one_hot_encode_emotions(emotions_pred, emotion_cols)

            is_valid = bool(sum(emotions_pred_one_hot))

            if is_valid:
                break
            else:
                print("Error! Prediction:", prediction)
                print()
            
        y_pred.append(emotions_pred_one_hot)

        print()
        print("# TEXT")
        print(row['text'])
        print()
        print("# KEYWORDS")
        print(keywords_pred)
        print()
        print("# PREDICTED EMOTIONS")
        print(emotions_pred)
        print()
        print("# TRUE EMOTIONS")
        print(row['emotion'])
        print()

    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=1.0)
    f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=1.0)
    f1_labels = f1_score(y_true, y_pred, average=None, zero_division=1.0)
    f1_labels_dict = {f'f1_label_{emotion_cols[i]}': float(f1_labels[i]) for i in range(len(f1_labels))}

    return {
        'f1_macro': float(f1_macro),
        'f1_micro': float(f1_micro),
        **f1_labels_dict,
    }

prompt_evaluations = []
for prompt_path in prompt_paths:
    print("=" * 128)
    print("PROMPT PATH:", prompt_path)
    print("=" * 128)

    evaluation = eval_sampled(n=20, prompt=get_prompt(prompt_path))
    print("Evaluation:")
    print(evaluation)

    prompt_evaluations.append({prompt_path: evaluation})
    
    print()

PROMPT PATH: prompts/prompt_with_keywords_and_analysis.txt
1 OF 20
Predicting...

# TEXT
Geuning geus numbub deui kumisna wkwkwk

# KEYWORDS
<keyword>
- Keyword: wkwkwk
- Analysis: The text includes the expression "wkwkwk," which commonly represents the sound of laughter in casual online communication, particularly in Indonesian and Sundanese cultures. This expression is typically used to indicate amusement or humor in a conversation. According to the guidelines, laughter-like expressions such as "hahaha," "wkwkw," or similar suggest the emotion of joy. The presence of "wkwkwk" in the sentence implies a light-hearted or humorous reaction to the preceding statement about someone's mustache having grown back. The use of "wkwkwk" is not associated with any negative emotions such as anger, sadness, or fear. Instead, it indicates that the speaker finds the situation funny or amusing. Therefore, this laughter expression points towards the overall emotion being joy in this context, as it sugg

In [40]:
prompt_evaluations

[{'prompts/prompt_with_keywords_and_analysis.txt': {'f1_macro': 0.5017281105990784,
   'f1_micro': 0.7272727272727273,
   'f1_label_anger': 1.0,
   'f1_label_disgust': 0.0,
   'f1_label_fear': 0.0,
   'f1_label_joy': 0.9032258064516129,
   'f1_label_sadness': 0.8571428571428571,
   'f1_label_surprise': 0.25}},
 {'prompts/prompts_with_keywords.txt': {'f1_macro': 0.7271524771524772,
   'f1_micro': 0.7843137254901961,
   'f1_label_anger': 1.0,
   'f1_label_disgust': 1.0,
   'f1_label_fear': 0.0,
   'f1_label_joy': 0.9285714285714286,
   'f1_label_sadness': 0.8888888888888888,
   'f1_label_surprise': 0.5454545454545454}}]

## Save Evaluation

In [None]:
# save_df = df.copy()
# save_df[emotion_cols] = y_pred
# save_df.to_csv('eval_sun.csv', index=False)
# print("Saved to: eval_sun.csv")