# Libraries

In [17]:
# import os
import json
import pandas as pd
from openai import OpenAI
from datasets import load_dataset
from sklearn.metrics import f1_score
from pprint import pprint

pd.set_option('max_colwidth', None)

# Config

In [18]:
seed = 42

client = OpenAI(api_key=json.load(open('secrets.json'))['OPENAPI_SECRET_KEY'])

idn2eng_emotion_map = {
    'marah': 'anger',
    'jijik': 'disgust',
    'takut': 'fear',
    'senang': 'joy',
    'sedih': 'sadness',
    'terkejut': 'surprise',
    'biasa': 'neutral',
}

# Data

## Load Data

In [19]:
# df = pd.read_csv('data/public_data_dev/track_a/train/sun.csv')
df = pd.read_csv('data/preprocessed_data/track_a/sun_70_15_15_stratify_v2/val.csv')
df = df.rename(columns=idn2eng_emotion_map)
emotion_cols = [col for col in df.columns if col not in ['Unnamed: 0', 'text', 'emotion', 'id']]
df['emotion'] = df.apply(lambda row: ', '.join([emotion for emotion in emotion_cols if row[emotion] == 1]), axis=1)

print("DF size:", len(df))
print("Emotion columns:", emotion_cols)
df.head()

DF size: 128
Emotion columns: ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']


Unnamed: 0,text,emotion,anger,disgust,fear,joy,sadness,surprise,neutral
0,alhamdulillah hatur nuhun otw nonton ieu mah,joy,0,0,0,1,0,0,0
1,"Nongton iklan nepi ka tamat ,😭",joy,0,0,0,1,0,0,0
2,min ai mnh teu cape lulumpatan wae :v kumaha weh ngagawe keun jelma mah ceuk mang dana ge :v,"joy, surprise",0,0,0,1,0,1,0
3,"Hebaaat , mang mung basa sundana rada rapihken ulah babeulit, nya, nya","joy, surprise",0,0,0,1,0,1,0
4,Aya meri dina rakit Boboko wadah bakatul Lain nyeri ku panyakit Kabogoh direbut batur🤣🤣😭,"joy, sadness, surprise",0,0,0,1,1,1,0


In [20]:
# Omit data with 'emotion' "neutral"
df = df[~(df['emotion'] == 'neutral')]
print("DF size:", len(df))

DF size: 121


In [83]:
# prompt = """# INSTRUCTION
# Identify the EMOTION(S) expressed in the TEXT. 
# This is a multi-label classification task, meaning the identified emotion(s) may include one or more labels from the predefined EMOTION LABELS below.

# ## EMOTION LABELS 
# - anger
# - fear
# - joy
# - sadness
# - surprise

# ## STEPS TO IDENTIFY THE EMOTION(S)
# 1. Extract all KEYWORD(S) that carry significant emotion-related information in the TEXT. For each `keyword`, specify the `related emotion` from the EMOTION LABELS in the format: `keyword -> related emotion`.
# 2. Based on the identified KEYWORD(S), determine the overall EMOTION(S) expressed in the TEXT.

# The OUTPUTS must consist of KEYWORD(S) and EMOTION(S).
# Only respond the KEYWORD(S) and EMOTION(S).

# # INPUT
# ## TEXT
# {text}

# # OUTPUTS
# ## KEYWORD(S)
# """

# ## TIPS
# - If the TEXT contains laughter-like expressions such as "hahaha," "wkwkw," or similar, the identified emotion(s) must at least include "joy."
# - If the TEXT contains variants of smile or laugh emojis, such as 😊, ☺️, 😃, 😁, 😆, 😇, 🥰, 😍, 😋, 🤗, 😌, the identified emotion(s) must at least include "joy."

# prompt = """# INSTRUCTION
# Identify the EMOTION(S) expressed in the TEXT. 
# This is a multi-label classification task, so there will always be one or more emotions to identify in the provided TEXT. 
# The model must output the KEYWORD(S) and the EMOTION(S) from the predefined EMOTION LABELS below. 
# The model must always return a result—there should never be an instance with no identified emotion(s).

# ## EMOTION LABELS 
# - anger
# - fear
# - joy
# - sadness
# - surprise

# ## STEPS TO IDENTIFY THE EMOTION(S)
# 1. Extract all KEYWORD(S) that carry significant emotion-related information in the TEXT. For each `keyword`, specify the `related emotion` from the EMOTION LABELS in the format: `keyword -> related emotion (reason)`.
# 2. Based on the identified KEYWORD(S), determine the overall EMOTION(S) expressed in the TEXT.

# ## TIPS
# - If the TEXT contains laughter-like expressions such as "hahaha," "wkwkw," or similar, the identified emotion(s) often include "joy."
# - If the TEXT contains variants of smile or laugh emojis, such as 😊, ☺️, 😃, 😁, 😆, 😇, 🥰, 😍, 😋, 🤗, 😌, the identified emotion(s) usually include "joy."
# - If the TEXT contains insults, the identified emotion(s) usually include both "disgust" and "anger."
# - In Sundanese, "NU" is commonly used to mean "yang" (in Indonesian), not "Nahdlatul Ulama." The model must interpret "NU" accordingly based on the context.

# The OUTPUTS must include the KEYWORD(S) and the EMOTION(S).
# Do not omit any emotion expressed in the TEXT.

# # INPUT
# ## TEXT
# {text}

# # OUTPUTS
# ## KEYWORD(S)
# """

# prompt = """# INSTRUCTION
# Identify the EMOTION(S) expressed in the TEXT. 
# This is a multi-label classification task, so there will always be one or more emotions to identify in the provided TEXT. 
# The model must output the KEYWORD(S) and the EMOTION(S) from the predefined EMOTION LABELS below. 
# The model must always return a result—there should never be an instance with no identified emotion(s).

# ## EMOTION LABELS 
# - anger
# - fear
# - joy
# - sadness
# - surprise

# ## STEPS TO IDENTIFY THE EMOTION(S)
# 1. Extract all KEYWORD(S) that carry significant emotion-related information in the TEXT. For each `keyword`, specify the `related emotion` from the EMOTION LABELS in the format: `keyword -> related emotion (reason)`.
# 2. Based on the identified KEYWORD(S), determine the overall EMOTION(S) expressed in the TEXT.

# ## TIPS
# - If the TEXT contains laughter-like expressions such as "hahaha," "wkwkw," or similar, the identified emotion(s) often include "joy."
# - If the TEXT contains variants of smile or laugh emojis, such as 😊, ☺️, 😃, 😁, 😆, 😇, 🥰, 😍, 😋, 🤗, 😌, the identified emotion(s) usually include "joy."
# - If the TEXT contains insults, the identified emotion(s) usually include both "disgust" and "anger."
# - In Sundanese, "NU" is commonly used to mean "yang" (in Indonesian), not "Nahdlatul Ulama." The model must interpret "NU" accordingly based on the context.
# - The emoji 😭 does not always indicate "sadness". It can sometimes represent extreme laughter that brings tears. In such cases, the identified emotion(s) should include "joy."

# The OUTPUTS must include the KEYWORD(S) and the EMOTION(S).
# Do not omit any emotion expressed in the TEXT.

# # INPUT
# ## TEXT
# {text}

# # OUTPUTS
# ## KEYWORD(S)
# """

# prompt = """# INSTRUCTION
# Identify the EMOTION(S) expressed in the TEXT. 
# This is a multi-label classification task, so there will always be one or more emotions to identify in the provided TEXT. 
# The model must output the KEYWORD(S) and the EMOTION(S) from the predefined EMOTION LABELS below. 
# The model must always return a result—there should never be an instance with no identified emotion(s).

# ## EMOTION LABELS 
# - anger
# - fear
# - joy
# - sadness
# - surprise

# ## STEPS TO IDENTIFY THE EMOTION(S)
# 1. Extract all KEYWORD(S) that carry significant emotion-related information in the TEXT. For each `keyword`, specify the `related emotion` from the EMOTION LABELS in the format: `keyword -> related emotion (reason)`.
# 2. Based on the identified KEYWORD(S), determine the overall EMOTION(S) expressed in the TEXT.

# ## TIPS
# - If the TEXT contains laughter-like expressions such as "hahaha," "wkwkw," or similar, the identified emotion(s) often include "joy."
# - If the TEXT contains variants of smile or laugh emojis, such as 😊, ☺️, 😃, 😁, 😆, 😇, 🥰, 😍, 😋, 🤗, 😌, the identified emotion(s) usually include "joy."
# - If the TEXT contains insults, the identified emotion(s) usually include both "disgust" and "anger."
# - In Sundanese, "NU" is commonly used to mean "yang" (in Indonesian), not "Nahdlatul Ulama." The model must interpret "NU" accordingly based on the context.
# - The emoji 😭 does not always indicate "sadness". It can sometimes represent extreme laughter that brings tears. In such cases, the identified emotion(s) should include "joy."

# The OUTPUTS must include the KEYWORD(S) and the EMOTION(S).
# Do not omit any emotion expressed in the TEXT.

# # INPUT
# ## TEXT
# {text}

# # OUTPUTS
# ## KEYWORD(S)
# """

prompt = """# INSTRUCTION
Identify the EMOTION(S) expressed in the TEXT. 
This is a multi-label classification task, meaning there will always be one or more emotions to identify in the TEXT. 
The model must output the EMOTION(S) from the predefined EMOTION LABELS listed below. 
The model must always return a result—there should never be an instance with no identified EMOTION(s).

## EMOTION LABELS
```
- anger
- fear
- joy
- sadness
- surprise
```

## STEPS TO IDENTIFY THE EMOTION(S)
1. Guess the CONTEXT of the TEXT by analyzing its overall meaning and tone. Provide a concise description of the CONTEXT to clarify the emotional environment in which the TEXT is situated.
2. Using the guessed CONTEXT, Extract all KEYWORD(S) that carry significant emotion-related information in the TEXT. For each `keyword`, specify the `related emotion` from the EMOTION LABELS in the format: `keyword -> related emotion (reason)`. Ensure that the extracted KEYWORD(S) align with the described CONTEXT to maintain consistency in interpretation.
3. Based on the identified KEYWORD(S), determine the overall EMOTION(S) expressed in the TEXT.

## TIPS
- Laughter-like expressions, such as "hahaha," "wkwkw," or similar, typically indicate `joy`.
- Smile or laugh emojis, such as 😊, ☺️, 😃, 😁, 😆, 😇, 🥰, 😍, 😋, 🤗, 😌, typically indicate `joy`.
- Insults typically indicate `anger` and may also suggest `disgust`.
- The emoji 😭 does not always indicate `sadness`. In humorous contexts, it can represent extreme laughter that brings tears, indicating `joy`.
- In Sundanese, "NU" often means "yang" in Indonesian, not "Nahdlatul Ulama". Always interpret "NU" based on the context.

The OUTPUTS must include the CONTEXT, KEYWORD(S) and EMOTION(S).
Do not omit any emotion expressed in the TEXT.

# INPUT
## TEXT
{text}

# OUTPUTS
## CONTEXT
"""

In [81]:
def predict(text):
    completion = client.chat.completions.create(
        model="gpt-4o",
        # model="gpt-4o-mini",
        messages=[
            # {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                'content': prompt.format(text=text),
            },
        ],
        max_tokens=300,
    )
    return completion.choices[0].message.content

def one_hot_encode_emotions(emotions, emotion_cols):
    one_hot_emotion = [1 if emotion_col in emotions else 0 for emotion_col in emotion_cols]
    return one_hot_emotion

In [27]:
# df_sampled = df.sample(n=20)
# y_true = df_sampled[emotion_cols].to_numpy()

# y_pred = []
# for i, v in enumerate(df_sampled.iterrows()):
#     idx, row = v

#     print("=" * 64)
#     print(f"{i+1} of {len(df_sampled)}")
#     print("=" * 64)

#     is_valid = True

#     while True:
#         print("Predicting...")        

#         output = predict(row['text'])
#         ctx = prompt + output

#         keywords_emotions = ctx.split("# KEYWORD(S)")[-1].split("## EMOTION(S)")
#         # keywords = keywords_emotions[0].strip().split("\n")
#         keywords = keywords_emotions[0].strip()
#         emotions = [emotion.replace("-", "").strip() for emotion in keywords_emotions[-1].strip().split("\n")]

#         is_valid = bool(sum([int(emotion_pred in emotion_cols) for emotion_pred in emotions]))

#         if is_valid:
#             break
#         else:
#             print("ERROR:", output)
#             print()
        
#     emotions_one_hot = one_hot_encode_emotions(emotions, emotion_cols)
#     y_pred.append(emotions_one_hot)

#     print()
#     print("# TEXT")
#     print(row['text'])
#     print()
#     print("# KEYWORD(S)")
#     print(keywords)
#     print()
#     print("# PREDICTED EMOTION(S)")
#     print(emotions)
#     print()
#     print("# TRUE EMOTION(S)")
#     print(row['emotion'])
#     print()
#     # df.at[i, 'text_sun2idn'] = text_sun2idn

# f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=1.0)
# print("F1 Macro:", f1_macro)

In [82]:
df_sampled = df.sample(n=20)
y_true = df_sampled[emotion_cols].to_numpy()

y_pred = []
for i, v in enumerate(df_sampled.iterrows()):
    idx, row = v

    print("=" * 64)
    print(f"{i+1} of {len(df_sampled)}")
    print("=" * 64)

    is_valid = True

    while True:
        print("Predicting...")        

        output = predict(row['text'])
        prompt_output = prompt + output

        outputs = prompt_output.split("# OUTPUTS")[-1]
        ctx = outputs.split("## CONTEXT")[-1].split("## KEYWORD(S)")[0].strip()
        keywords = outputs.split("## KEYWORD(S)")[-1].split("## EMOTION(S)")[0].strip()
        emotions = outputs.split("## EMOTION(S)")[-1].strip()
        emotions = [emotion.replace("-", "").strip() for emotion in emotions.split("\n")]

        is_valid = bool(sum([int(emotion_pred in emotion_cols) for emotion_pred in emotions]))

        if is_valid:
            break
        else:
            print("ERROR:", output)
            print()
        
    emotions_one_hot = one_hot_encode_emotions(emotions, emotion_cols)
    y_pred.append(emotions_one_hot)

    print()
    print("# TEXT")
    print(row['text'])
    print()
    print("# CONTEXT")
    print(ctx)
    print()
    print("# KEYWORD(S)")
    print(keywords)
    print()
    print("# PREDICTED EMOTION(S)")
    print(emotions)
    print()
    print("# TRUE EMOTION(S)")
    print(row['emotion'])
    print()
    # df.at[i, 'text_sun2idn'] = text_sun2idn

f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=1.0)
print("F1 Macro:", f1_macro)

1 of 20
Predicting...

# TEXT
pokonamah nuhun sabandungeun , kita terus tingkatkeun skill babarengan mang :) KAPAN NIKAH ? :(

# CONTEXT
The TEXT expresses gratitude and encouragement for skill improvement, followed by a sudden shift to a question about marriage, which adds an element of surprise or frustration.

# KEYWORD(S)
- "nuhun" -> joy (expression of gratitude)
- "terus tingkatkeun skill babarengan" -> joy (encouragement and positivity)
- "KAPAN NIKAH ?" -> surprise (unexpected or abrupt question about marriage)
- ":( " -> sadness or frustration (indicating some negative feeling, possibly unease about the question)

# PREDICTED EMOTION(S)
['joy', 'surprise', 'sadness']

# TRUE EMOTION(S)
joy, sadness

2 of 20
Predicting...

# TEXT
Hmmm teu tiasa berkata² deui , sedih pmi di dangukeun lirikna , Komo bari ningal klip na duh deg pisan kana hate na teh

# CONTEXT
The TEXT expresses a somber and reflective state, where the listener is deeply moved by the song's lyrics and music video

# Create Submission

In [84]:
dev_df = pd.read_csv('data/public_data_dev/track_a/dev/sun.csv')
emotion_cols = [col for col in dev_df.columns if col not in ['Unnamed: 0', 'text', 'emotion', 'id']]

print("Dev DF size:", len(dev_df))
print("Emotion columns:", emotion_cols)
dev_df.head()

Dev DF size: 199
Emotion columns: ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']


Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise
0,sun_dev_track_a_00001,"Ajigg kalah hyg seuri ku musik na, asa asing hhhha",,,,,,
1,sun_dev_track_a_00002,2:37 Uyuhan teu ti Jungkeul oge eta si aa na 🤣,,,,,,
2,sun_dev_track_a_00003,Anu Matak Mun di Imah kosong tong ngomong punten bisi Aya NU nembalan,,,,,,
3,sun_dev_track_a_00004,Beh Naha topi na teu di pake ? Asa teu mernah,,,,,,
4,sun_dev_track_a_00005,Ai istrina atos sabaraha bulan mang?,,,,,,


In [92]:
y_pred = []
for i, v in enumerate(dev_df.iterrows()):
# for i, v in enumerate(dev_df[len(y_pred):].iterrows()):
    idx, row = v

    print("=" * 64)
    print(f"{i+1} of {len(dev_df)}")
    print("=" * 64)

    is_valid = True

    while True:
        print("Predicting...")        

        output = predict(row['text'])
        ctx = prompt + output

        keywords_emotions = ctx.split("# KEYWORD(S)")[-1].split("## EMOTION(S)")
        # keywords = keywords_emotions[0].strip().split("\n")
        keywords = keywords_emotions[0].strip()
        emotions = [emotion.replace("-", "").strip() for emotion in keywords_emotions[-1].strip().split("\n")]

        is_valid = bool(sum([int(emotion_pred in emotion_cols) for emotion_pred in emotions]))

        if is_valid:
            break
        else:
            print("ERROR:", output)
            print()
        
    emotions_one_hot = one_hot_encode_emotions(emotions, emotion_cols)
    y_pred.append(emotions_one_hot)

    print()
    print("# TEXT")
    print(row['text'])
    print()
    print("# KEYWORD(S)")
    print(keywords)
    print()
    print("# PREDICTED EMOTION(S)")
    print(emotions)
    print()
    if 'emotion' in dev_df.columns:
        print("# TRUE EMOTION(S)")
        print(row['emotion'])
    print()

print(y_pred)

1 of 199
Predicting...

# TEXT
Ahh edaaan teu apal, mun apal miluan 😭

# KEYWORD(S)
# INSTRUCTION
Identify the EMOTION(S) expressed in the TEXT. 
This is a multi-label classification task, meaning there will always be one or more emotions to identify in the TEXT. 
The model must output the EMOTION(S) from the predefined EMOTION LABELS listed below. 
The model must always return a result—there should never be an instance with no identified EMOTION(s).

## EMOTION LABELS
```
- anger
- fear
- joy
- sadness
- surprise
```

## STEPS TO IDENTIFY THE EMOTION(S)
1. Guess the CONTEXT of the TEXT by analyzing its overall meaning and tone. Provide a concise description of the CONTEXT to clarify the emotional environment in which the TEXT is situated.
2. Using the guessed CONTEXT, Extract all KEYWORD(S) that carry significant emotion-related information in the TEXT. For each `keyword`, specify the `related emotion` from the EMOTION LABELS in the format: `keyword -> related emotion (reason)`. Ensur

In [94]:
dev_df[emotion_cols] = y_pred

In [96]:
dev_df[['id'] + emotion_cols].to_csv('pred_sun.csv', index=False)