# Libraries

In [11]:
import os
import json
import re
import pandas as pd
from datetime import datetime
from openai import OpenAI
from sklearn.metrics import f1_score
# from pprint import pprint

# Show full text columns without truncation. Use the fully qualified option name:
# the bare 'max_colwidth' shorthand is resolved by pattern matching and can break
# if pandas ever adds another option whose name also matches.
pd.set_option('display.max_colwidth', None)

# Config

In [12]:
# NOTE(review): `seed` is not referenced by any cell visible in this notebook
# (e.g. df.sample() is called without random_state) — confirm whether it should be wired in.
seed = 42

# Load the API key inside a context manager so the secrets file handle is closed
# deterministically instead of being left to the garbage collector.
with open('secrets.json') as secrets_file:
    client = OpenAI(api_key=json.load(secrets_file)['OPENAPI_SECRET_KEY'])

# Maps the Indonesian emotion column names used in the CSV to their English equivalents.
idn2eng_emotion_map = {
    'marah': 'anger',
    'jijik': 'disgust',
    'takut': 'fear',
    'senang': 'joy',
    'sedih': 'sadness',
    'terkejut': 'surprise',
    'biasa': 'neutral',
}

# Prompt files to evaluate; comment entries in or out to choose which ones run.
prompt_paths = [
    # 'prompts/prompt_with_keywords_and_analysis.txt',
    'prompts/prompt_with_keywords.txt',
    'prompts/prompt_with_keywords_v3.txt',
    # 'prompts/prompt_with_keywords_and_intensity.txt',
    # 'prompts/prompt_with_keywords_v2.txt',
    # 'prompts/prompt_with_keywords_without_guidelines.txt'
]

# Directory where evaluation results are written as JSON and later read back.
prompt_evaluations_dir = 'prompt_evaluations'

# Data

## Load Data

In [13]:
# Read the validation split and, in the same chain, rename the Indonesian
# emotion columns to their English names.
df = (
    pd.read_csv('data/preprocessed_data/track_a/sun_70_15_15_stratify_v2/val.csv')
    .rename(columns=idn2eng_emotion_map)
)

print(f"DF size: {len(df)}")
df.head()

DF size: 128


Unnamed: 0,text,emotion,anger,disgust,fear,joy,sadness,surprise,neutral
0,alhamdulillah hatur nuhun otw nonton ieu mah,senang,0,0,0,1,0,0,0
1,"Nongton iklan nepi ka tamat ,😭",senang,0,0,0,1,0,0,0
2,min ai mnh teu cape lulumpatan wae :v kumaha weh ngagawe keun jelma mah ceuk mang dana ge :v,"senang, terkejut",0,0,0,1,0,1,0
3,"Hebaaat , mang mung basa sundana rada rapihken ulah babeulit, nya, nya","senang, terkejut",0,0,0,1,0,1,0
4,Aya meri dina rakit Boboko wadah bakatul Lain nyeri ku panyakit Kabogoh direbut batur🤣🤣😭,"senang, sedih, terkejut",0,0,0,1,1,1,0


In [14]:
# Drop every row flagged as 'neutral', then drop the 'neutral' column itself.
# The guard keeps the cell safe to re-run once the column is already gone.
if 'neutral' in df.columns:
    df = df.loc[df['neutral'] != 1].drop(columns='neutral')

# Every column that is not an index/text/label-string column is a one-hot emotion flag.
emotion_cols = [name for name in df.columns if name not in ('Unnamed: 0', 'text', 'emotion', 'id')]

# Rebuild the human-readable label string from the one-hot flags (English names, comma separated).
df['emotion'] = df.apply(
    lambda row: ', '.join(name for name in emotion_cols if row[name] == 1),
    axis=1,
)

print(f"DF size (after removing emotion 'neutral'): {len(df)}")
print(f"Emotion columns: {emotion_cols}")
df.head()

DF size (after removing emotion 'neutral'): 121
Emotion columns: ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']


Unnamed: 0,text,emotion,anger,disgust,fear,joy,sadness,surprise
0,alhamdulillah hatur nuhun otw nonton ieu mah,joy,0,0,0,1,0,0
1,"Nongton iklan nepi ka tamat ,😭",joy,0,0,0,1,0,0
2,min ai mnh teu cape lulumpatan wae :v kumaha weh ngagawe keun jelma mah ceuk mang dana ge :v,"joy, surprise",0,0,0,1,0,1
3,"Hebaaat , mang mung basa sundana rada rapihken ulah babeulit, nya, nya","joy, surprise",0,0,0,1,0,1
4,Aya meri dina rakit Boboko wadah bakatul Lain nyeri ku panyakit Kabogoh direbut batur🤣🤣😭,"joy, sadness, surprise",0,0,0,1,1,1


# Evaluation

In [15]:
def get_prompt(prompt_path):
    """Read a prompt template from disk and return its full text.

    Args:
        prompt_path: Path to the prompt text file.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 instead of the platform default encoding, so the
    # same prompt file reads identically on every OS.
    # NOTE(review): assumes the prompt files are stored as UTF-8 — confirm.
    with open(prompt_path, 'r', encoding='utf-8') as f:
        return f.read()

def one_hot_encode_emotions(emotions, emotion_cols=emotion_cols):
    """Encode a collection of emotion labels as a 0/1 vector.

    Args:
        emotions: Container of predicted labels; membership is tested with ``in``.
        emotion_cols: Ordered label names that define the vector positions.
            Defaults to the module-level ``emotion_cols`` as it was when this
            function was defined.

    Returns:
        A list with one int per entry of ``emotion_cols``: 1 if that label is
        in ``emotions``, otherwise 0.
    """
    encoded = []
    for label in emotion_cols:
        encoded.append(int(label in emotions))
    return encoded

def get_pred_tag_content(prediction, tag_name):
    """Extract the text enclosed by ``<tag_name>...</tag_name>`` in a model reply.

    Args:
        prediction: Raw reply text. ``None`` or an empty string is treated as
            "tag not found" instead of raising.
        tag_name: Tag name without angle brackets. It is matched literally, so
            regex metacharacters in the name have no special meaning.

    Returns:
        The stripped content of the first matching tag pair (it may span
        several lines), or ``None`` if the tag pair is absent.
    """
    if not prediction:
        return None

    # Escape the tag name so it cannot alter the pattern.
    tag = re.escape(tag_name)
    # DOTALL lets the non-greedy group span line breaks inside the tag.
    match = re.search(fr'<{tag}>(.*?)</{tag}>', prediction, re.DOTALL)
    return match.group(1).strip() if match else None

def predict(text, prompt):
    """Send ``text`` to the chat model with ``prompt`` as the system message.

    Args:
        text: The user message content (the text to analyse).
        prompt: The system message content.

    Returns:
        The message content of the first choice in the completion.
    """
    conversation = [
        {'role': 'system', 'content': prompt},
        {'role': 'user', 'content': text},
    ]
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=conversation,
        max_tokens=1000,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

# Smoke test: run a single prediction with the first configured prompt and show
# how the raw reply is parsed into keywords and a one-hot emotion vector.
prediction = predict(
    text="Cek urang mah jelema goblog teh nyaeta, jelema nu menta udud ngomong teu boga, padahal Aya sabungkus dinu pesak 😂🤣",
    prompt=get_prompt(prompt_paths[0]),
)

keywords_pred = get_pred_tag_content(prediction, 'keywords')
print("Precicted keywords and their analyses:")
print(keywords_pred)
print()

# get_pred_tag_content returns None when the reply has no <emotions> tag; fall back
# to an empty string so the normalisation below does not raise AttributeError.
emotions_raw = get_pred_tag_content(prediction, 'emotions') or ""
# Normalise: lowercase, drop periods and spaces, split on commas, discard empty pieces.
emotions_pred = [
    label
    for label in emotions_raw.lower().replace("." , "").replace(" ", "").split(",")
    if label
]
print("Precicted emotions:", emotions_pred, "->", one_hot_encode_emotions(emotions_pred))

Precicted keywords and their analyses:
- goblog -> anger, disgust (insult towards a person)
- 😂🤣 -> joy (laughter emojis)

Precicted emotions: ['anger', 'disgust', 'joy'] -> [1, 1, 0, 1, 0, 0]


In [16]:
def _parse_emotion_labels(emotions_text):
    """Turn the raw ``<emotions>`` tag content into a list of clean labels.

    Lowercases, removes periods and spaces, and splits on commas, so replies
    such as "Joy, Sadness." map onto the lowercase column names. Returns an
    empty list when the tag content is missing or empty.
    """
    if not emotions_text:
        return []
    cleaned = emotions_text.lower().replace(".", "").replace(" ", "")
    return [label for label in cleaned.split(",") if label]


def eval_sampled(n, prompt_path, max_retries=None):
    """Evaluate one prompt on ``n`` randomly sampled rows of ``df``.

    Each sampled text is sent to the model, and the request is repeated until
    the reply yields at least one label from ``emotion_cols``. A reply with no
    ``<emotions>`` tag, or with no recognised label, counts as invalid and is
    retried. Progress and per-row details are printed.

    Args:
        n: Number of rows to sample from ``df``.
        prompt_path: Path of the prompt file to evaluate.
        max_retries: Optional cap on retries per row after the first attempt.
            ``None`` (default) retries indefinitely.

    Returns:
        A dict with ``prompt_path``, ``n``, ``f1_macro``, ``f1_micro`` and one
        ``f1_label_<emotion>`` entry per emotion column.

    Raises:
        RuntimeError: If ``max_retries`` is set and a row still has no valid
            prediction after that many retries.
    """
    prompt = get_prompt(prompt_path)

    df_sampled = df.sample(n=n)
    y_true = df_sampled[emotion_cols].to_numpy()

    y_pred = []
    for i, (_, row) in enumerate(df_sampled.iterrows(), start=1):
        print("=" * 64)
        print(f"{i} OF {len(df_sampled)}")
        print("=" * 64)

        retries = 0
        while True:
            print("Predicting...")

            prediction = predict(row['text'], prompt)
            # NOTE(review): predict() may return None if the completion has no text
            # content — treated here as an unparsable reply; confirm against the API docs.
            prediction_text = prediction or ""
            keywords_pred = get_pred_tag_content(prediction_text, 'keywords')
            emotions_pred = _parse_emotion_labels(get_pred_tag_content(prediction_text, 'emotions'))
            emotions_pred_one_hot = one_hot_encode_emotions(emotions_pred, emotion_cols)

            # Valid only if at least one known emotion was recognised.
            if any(emotions_pred_one_hot):
                break

            print("Error! Prediction:", prediction)
            print()

            if max_retries is not None and retries >= max_retries:
                raise RuntimeError(
                    f"No valid emotion prediction after {retries + 1} attempts for text: {row['text']!r}"
                )
            retries += 1

        y_pred.append(emotions_pred_one_hot)

        print()
        print("# TEXT")
        print(row['text'])
        print()
        print("# KEYWORDS")
        print(keywords_pred)
        print()
        print("# PREDICTED EMOTIONS")
        print(emotions_pred)
        print()
        print("# TRUE EMOTIONS")
        print(row['emotion'])
        print()

    # zero_division=1.0 is passed to every f1_score call, as in the original evaluation.
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=1.0)
    f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=1.0)
    f1_labels = f1_score(y_true, y_pred, average=None, zero_division=1.0)
    f1_labels_dict = {f'f1_label_{emotion_cols[i]}': float(f1_labels[i]) for i in range(len(f1_labels))}

    return {
        'prompt_path': prompt_path,
        'n': n,
        'f1_macro': float(f1_macro),
        'f1_micro': float(f1_micro),
        **f1_labels_dict,
    }

# Evaluate each configured prompt on a sample of 20 rows and collect the metric dicts.
prompt_eval = []
for prompt_path in prompt_paths:
    # Banner separating the output of successive prompts.
    print("=" * 128, f"PROMPT PATH: {prompt_path}", "=" * 128, sep="\n")

    evaluation = eval_sampled(prompt_path=prompt_path, n=20)
    print("Evaluation:", evaluation, sep="\n")
    prompt_eval.append(evaluation)

    print()

PROMPT PATH: prompts/prompt_with_keywords.txt
1 OF 20
Predicting...

# TEXT
Neng kunaon mundur?keur naon maju oge geus boga pamajikan, 😢

# KEYWORDS
- mundur -> sadness (indicates stepping back, possibly from a relationship or situation)
- geus boga pamajikan -> sadness (implies a reason for stepping back is due to someone being already married)
- 😢 -> sadness (crying emoji typically indicates sadness)

# PREDICTED EMOTIONS
['sadness']

# TRUE EMOTIONS
sadness, surprise

2 OF 20
Predicting...

# TEXT
Urang mantan anu di sakiti mang

# KEYWORDS
- mantan -> sadness (implies a past relationship)
- di sakiti -> sadness (indicates experiencing pain or hurt)

# PREDICTED EMOTIONS
['sadness']

# TRUE EMOTIONS
sadness

3 OF 20
Predicting...

# TEXT
Lin kadua atoh heu jadi buming deui ku 3 pemuda berbahaya lagu ieu th

# KEYWORDS
- atoh -> joy (joyful expression about the event happening again or becoming famous)
- buming -> joy (excitement or happiness about something becoming popular or a hit

In [17]:
# Display the per-prompt metric dicts collected by the evaluation loop.
# NOTE(review): the stored output lists 'prompts/prompt_with_keywords_and_intensity.txt',
# which is commented out in prompt_paths — the output appears to predate the current
# config; re-run the notebook to refresh it.
prompt_eval

[{'prompt_path': 'prompts/prompt_with_keywords.txt',
  'n': 20,
  'f1_macro': 0.5596405228758169,
  'f1_micro': 0.7857142857142857,
  'f1_label_anger': 1.0,
  'f1_label_disgust': 0.0,
  'f1_label_fear': 0.5,
  'f1_label_joy': 0.9166666666666666,
  'f1_label_sadness': 0.9411764705882353,
  'f1_label_surprise': 0.0},
 {'prompt_path': 'prompts/prompt_with_keywords_v3.txt',
  'n': 20,
  'f1_macro': 0.6981566820276498,
  'f1_micro': 0.8085106382978723,
  'f1_label_anger': 1.0,
  'f1_label_disgust': 0.0,
  'f1_label_fear': 1.0,
  'f1_label_joy': 0.9032258064516129,
  'f1_label_sadness': 1.0,
  'f1_label_surprise': 0.2857142857142857},
 {'prompt_path': 'prompts/prompt_with_keywords_and_intensity.txt',
  'n': 20,
  'f1_macro': 0.40845204178537514,
  'f1_micro': 0.6428571428571429,
  'f1_label_anger': 0.6666666666666666,
  'f1_label_disgust': 0.0,
  'f1_label_fear': 0.0,
  'f1_label_joy': 0.8148148148148148,
  'f1_label_sadness': 0.7692307692307693,
  'f1_label_surprise': 0.2}]

## Save Evaluation

In [18]:
# Create the output directory on first use so saving does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(prompt_evaluations_dir, exist_ok=True)

# One timestamped file per run, so earlier evaluations are kept.
prompt_eval_path = os.path.join(
    prompt_evaluations_dir,
    f'prompt_eval_{datetime.now().strftime("%Y%m%d%H%M%S")}.json',
)
with open(prompt_eval_path, 'w') as f:
    json.dump(prompt_eval, f, indent=4)

## View Saved Evaluations

In [20]:
# Only the '.json' files are evaluation reports. os.listdir returns every entry in the
# directory, and anything else (checkpoint folders, OS metadata files) would make
# pd.read_json fail. Sorting makes the load order deterministic.
prompt_eval_names = sorted(
    name for name in os.listdir(prompt_evaluations_dir) if name.endswith('.json')
)
prompt_eval_dfs = [pd.read_json(os.path.join(prompt_evaluations_dir, prompt_eval_name)) for prompt_eval_name in prompt_eval_names]

# Despite its name, concat_df is the GroupBy object over all saved runs, keyed by prompt.
concat_df = pd.concat(prompt_eval_dfs).groupby('prompt_path', as_index=False)
mean_df = concat_df.mean()
sum_df = concat_df.sum()
# Metrics are averaged across runs (unweighted), but 'n' is replaced by the total number
# of samples evaluated for each prompt.
mean_df['n'] = sum_df['n']
mean_df

Unnamed: 0,prompt_path,n,f1_macro,f1_micro,f1_label_anger,f1_label_disgust,f1_label_fear,f1_label_joy,f1_label_sadness,f1_label_surprise
0,prompts/prompt_with_keywords.txt,280,0.564517,0.754588,0.640476,0.571429,0.178571,0.915101,0.850094,0.231432
1,prompts/prompt_with_keywords_and_analysis.txt,60,0.517154,0.70622,0.5,0.666667,0.0,0.851478,0.779221,0.305556
2,prompts/prompt_with_keywords_and_intensity.txt,40,0.437113,0.704407,0.333333,0.5,0.0,0.876157,0.813187,0.1
3,prompts/prompt_with_keywords_v2.txt,120,0.541275,0.740009,0.388889,0.666667,0.166667,0.872351,0.874038,0.27904
4,prompts/prompt_with_keywords_v3.txt,80,0.544549,0.701911,0.914286,0.166667,0.25,0.854973,0.830556,0.250812
5,prompts/prompt_with_keywords_without_guidelines.txt,40,0.517654,0.667641,0.25,1.0,0.0,0.873016,0.717949,0.264957
