# Libraries

In [11]:
# import os
import re
import json
import pandas as pd
from openai import OpenAI
# from datasets import load_dataset
# from sklearn.metrics import f1_score
# from pprint import pprint

# Show full cell contents (no truncation) when displaying DataFrames.
# The fully-qualified option key is used because pandas resolves shorthand
# keys by pattern matching, which fails if the pattern ever becomes ambiguous.
pd.set_option('display.max_colwidth', None)

# Config

In [2]:
# NOTE(review): `seed` is not referenced by any cell visible in this notebook.
seed = 42

# Read the API key from secrets.json. The context manager guarantees the file
# handle is closed, and the key is passed straight to the client so it does not
# linger in a separate notebook variable.
with open('secrets.json', 'r', encoding='utf-8') as secrets_file:
    client = OpenAI(api_key=json.load(secrets_file)['OPENAPI_SECRET_KEY'])

# System prompt used for every prediction request.
prompt_path = 'prompts/prompt_with_keywords.txt'

# Data

## Load Data

In [3]:
# df = pd.read_csv('data/public_data_test/track_a/test/sun.csv')
# # df = pd.read_csv('data/preprocessed_data/track_a/sun_70_15_15_stratify_v2/val.csv')
# df = df.rename(columns=idn2eng_emotion_map)
# emotion_cols = [col for col in df.columns if col not in ['Unnamed: 0', 'text', 'emotion', 'id']]
# df['emotion'] = df.apply(lambda row: ', '.join([emotion for emotion in emotion_cols if row[emotion] == 1]), axis=1)

# print("DF size:", len(df))
# print("Emotion columns:", emotion_cols)
# df.head()

In [4]:
# Load the CSV that will be labelled by the model.
test_csv_path = 'data/public_data_test/track_a/test/sun.csv'
df = pd.read_csv(test_csv_path)

print("DF size:", df.shape[0])
df.head()

DF size: 926


Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise
0,sun_test_track_a_00001,Kualitas urg sunda emg teu pernah ngaboong👍,,,,,,
1,sun_test_track_a_00002,Sampe sakit perut aduh akang teu kuat 🤣,,,,,,
2,sun_test_track_a_00003,Aduh ey urang telat nonton kumaha atuh,,,,,,
3,sun_test_track_a_00004,mang nyieun deui video anu kocak,,,,,,
4,sun_test_track_a_00005,Kang abdi rek meli baju urang sunda cimahi di lewigajah taman bukit cibogo no a1 rt tilu,,,,,,


In [5]:
# Store emotion columns
emotion_cols = [col for col in df.columns if col not in ['Unnamed: 0', 'text', 'emotion', 'id']]

print("Emotion columns:", emotion_cols)

Emotion columns: ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']


# Submission

## Create Submission

In [6]:
def get_prompt(prompt_path):
    """Return the full text of the prompt file at ``prompt_path``.

    The file is decoded explicitly as UTF-8 so that non-ASCII characters
    (e.g. emoji or accented letters in the prompt) are read identically on
    every platform instead of depending on the locale's default encoding.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open(prompt_path, 'r', encoding='utf-8') as f:
        return f.read()

def one_hot_encode_emotions(emotions, emotion_cols=emotion_cols):
    """Encode ``emotions`` as a 0/1 list aligned with ``emotion_cols``.

    Position ``k`` of the result is 1 when ``emotion_cols[k]`` is contained
    in ``emotions`` and 0 otherwise. Containment uses Python's ``in``
    operator, so pass a list/set of labels (a plain string would be matched
    by substring).

    The default for ``emotion_cols`` is bound when this function is defined;
    re-run this definition if the module-level ``emotion_cols`` is rebound.
    """
    return [int(label in emotions) for label in emotion_cols]

def get_pred_tag_content(prediction, tag_name):
    """Extract the text enclosed by the first ``<tag_name>...</tag_name>`` pair.

    The match is non-greedy and may span multiple lines. Surrounding
    whitespace is stripped from the captured text. Returns ``None`` when the
    tag pair does not occur in ``prediction``.
    """
    pattern = fr'<{tag_name}>(.*?)</{tag_name}>'
    found = re.search(pattern, prediction, re.DOTALL)
    if found is None:
        return None
    return found.group(1).strip()

def predict(text, prompt):
    """Ask the chat model to analyse ``text`` using ``prompt`` as instructions.

    ``prompt`` is sent as the system message and ``text`` as the user
    message, via the module-level ``client``. Returns the message content of
    the first choice in the response.
    """
    messages = [
        {'role': 'system', 'content': prompt},
        {'role': 'user', 'content': text},
    ]
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=messages,
        max_tokens=1000,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

# Smoke-test the prompt and the parsing helpers on one sentence before the full run.
prediction = predict(
    text="Cek urang mah jelema goblog teh nyaeta, jelema nu menta udud ngomong teu boga, padahal Aya sabungkus dinu pesak 😂🤣",
    prompt=get_prompt(prompt_path),
)

keywords_pred = get_pred_tag_content(prediction, 'keywords')
print("Precicted keywords and their analyses:")
print(keywords_pred)
print()

# get_pred_tag_content returns None when the tag is absent; fall back to an
# empty string so a malformed reply yields an all-zero encoding instead of an
# AttributeError on None.
emotions_raw = get_pred_tag_content(prediction, 'emotions') or ""
# Normalise "Anger, Joy." -> ['anger', 'joy'] so labels match the column names.
emotions_pred = emotions_raw.lower().replace("." , "").replace(" ", "").split(",")
print("Precicted emotions:", emotions_pred, "->", one_hot_encode_emotions(emotions_pred))

Precicted keywords and their analyses:
- goblog -> anger (insulting term for stupidity)
- 😂 -> joy (laughter emoji)
- 🤣 -> joy (laughter emoji)

Precicted emotions: ['anger', 'joy'] -> [1, 0, 0, 1, 0, 0]


In [7]:
prompt = get_prompt(prompt_path)

# Upper bound on requests per text. Without it, a text for which the model
# never names a known emotion would be re-queried forever (unbounded API cost).
max_attempts = 5

# One one-hot row per row of df, in df order.
y_pred = []
for i, text in enumerate(df['text']):
    print("=" * 64)
    print(f"{i+1} OF {len(df)}")
    print("=" * 64)

    for attempt in range(1, max_attempts + 1):
        print("Predicting...")

        # NOTE(review): the API may return no message content; coerce to ""
        # so the tag parser below always receives a string.
        prediction = predict(text, prompt) or ""
        keywords_pred = get_pred_tag_content(prediction, 'keywords')

        # A missing <emotions> tag gives None; treat it as an empty answer so
        # it is retried rather than crashing the whole run.
        emotions_raw = get_pred_tag_content(prediction, 'emotions') or ""
        # Same normalisation as the single-sample check: lower-case, drop
        # periods and spaces, so "Joy." still matches the 'joy' column.
        emotions_pred = emotions_raw.lower().replace(".", "").replace(" ", "").split(",")
        emotions_pred_one_hot = one_hot_encode_emotions(emotions_pred, emotion_cols)

        # A prediction is accepted once it names at least one known emotion.
        if any(emotions_pred_one_hot):
            break

        print("Error! Prediction:", prediction)
        print()
    else:
        # Every attempt failed: keep the all-zero row from the last attempt.
        print(f"Giving up after {max_attempts} attempts; recording no emotions.")
        print()

    y_pred.append(emotions_pred_one_hot)

    print()
    print("# TEXT")
    print(text)
    print()
    print("# KEYWORDS")
    print(keywords_pred)
    print()
    print("# PREDICTED EMOTIONS")
    print(emotions_pred)
    print()

print("=" * 128)
print("PROMPT PATH:", prompt_path)
print("=" * 128)

print("Predicted Y:")
print(y_pred)

1 OF 926
Predicting...

# TEXT
Kualitas urg sunda emg teu pernah ngaboong👍

# KEYWORDS
- teu pernah ngaboong -> joy (indicating reliability and truthfulness)
- 👍 -> joy (positive gesture, indicating approval or agreement)

# PREDICTED EMOTIONS
['joy']

2 OF 926
Predicting...

# TEXT
Sampe sakit perut aduh akang teu kuat 🤣

# KEYWORDS
- sakit perut -> joy (expression in a humorous context)
- 🤣 -> joy (laughter emoji)

# PREDICTED EMOTIONS
['joy']

3 OF 926
Predicting...

# TEXT
Aduh ey urang telat nonton kumaha atuh

# KEYWORDS
- telat -> sadness (being late can cause disappointment or regret)

# PREDICTED EMOTIONS
['sadness']

4 OF 926
Predicting...

# TEXT
mang nyieun deui video anu kocak

# KEYWORDS
- kocak -> joy (humorous and funny content)

# PREDICTED EMOTIONS
['joy']

5 OF 926
Predicting...

# TEXT
Kang abdi rek meli baju urang sunda cimahi di lewigajah taman bukit cibogo no a1 rt tilu

# KEYWORDS
- Not enough emotion-related keywords found in this text.

# PREDICTED EMOTIONS
['

## Save Submission

In [8]:
# Overwrite the emotion columns with the predicted one-hot rows.
# y_pred must hold one row per row of df, in the same order, with one value
# per entry of emotion_cols.
df[emotion_cols] = y_pred
df.head()

Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise
0,sun_test_track_a_00001,Kualitas urg sunda emg teu pernah ngaboong👍,0,0,0,1,0,0
1,sun_test_track_a_00002,Sampe sakit perut aduh akang teu kuat 🤣,0,0,0,1,0,0
2,sun_test_track_a_00003,Aduh ey urang telat nonton kumaha atuh,0,0,0,0,1,0
3,sun_test_track_a_00004,mang nyieun deui video anu kocak,0,0,0,1,0,0
4,sun_test_track_a_00005,Kang abdi rek meli baju urang sunda cimahi di lewigajah taman bukit cibogo no a1 rt tilu,0,0,0,1,0,0


In [9]:
# Write only the id and the predicted emotion columns, without the row index.
submission_cols = ['id', *emotion_cols]
df[submission_cols].to_csv('pred_sun_test_gpt4o_10.csv', index=False)