# ChatGPT: Few-Shot Learning. Getting Keywords, Title, Summary
## ACL 2023 Conference
## WASSA 2023 Shared Task on Empathy, Emotion, and Personality Detection in Interactions
More details [here](https://codalab.lisn.upsaclay.fr/competitions/11167#learn_the_details)

In [1]:
import openai
import os
import re
import numpy as np
import pandas as pd
import time
import pickle
import tiktoken
import backoff
from typing import List
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from openai.embeddings_utils import cosine_similarity
from tqdm.autonotebook import tqdm
tqdm.pandas()

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 400)

# to see all env variables:
#for name, value in os.environ.items():
#    print("{0}: {1}".format(name, value))

  from tqdm.autonotebook import tqdm


In [2]:
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    '''Count the tokens a list of chat messages occupies in a ChatGPT prompt.

    messages: list of dicts (e.g. {'role': ..., 'content': ...}); every value
              of every dict is tokenized and counted.
    model:    model name. The aliases "gpt-3.5-turbo" and "gpt-4" are counted
              as "gpt-3.5-turbo-0301" and "gpt-4-0314" respectively.

    Returns the total number of tokens (int).
    Raises NotImplementedError for any other model name.
    '''
    # aliases are counted as a fixed dated model
    dated_model_for_alias = {
        "gpt-3.5-turbo": "gpt-3.5-turbo-0301",
        "gpt-4": "gpt-4-0314",
    }
    # model -> (tokens added per message, adjustment when a "name" key is present)
    # gpt-3.5-turbo-0301: every message follows <|start|>{role/name}\n{content}<|end|>\n,
    # and if there's a name, the role is omitted (hence -1)
    overhead_for_model = {
        "gpt-3.5-turbo-0301": (4, -1),
        "gpt-4-0314": (3, 1),
    }

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # model not known to tiktoken: fall back to the cl100k_base encoding
        encoding = tiktoken.get_encoding("cl100k_base")

    if model in dated_model_for_alias:
        return num_tokens_from_messages(messages, model=dated_model_for_alias[model])
    if model not in overhead_for_model:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")

    tokens_per_message, tokens_per_name = overhead_for_model[model]
    total = 0
    for message in messages:
        total += tokens_per_message
        total += sum(len(encoding.encode(value)) for value in message.values())
        if "name" in message:
            total += tokens_per_name
    # every reply is primed with <|start|>assistant<|message|>
    return total + 3

In [3]:
# Sanity check: count the tokens that '####' encodes to with the
# gpt-3.5-turbo-0301 tokenizer (it is just one token).
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
len(encoding.encode('####'))

1

In [4]:
def get_dummy_messages(df_, text_col):
    '''Wrap all texts of one dataframe column into a single dummy user message.

    The values of df_[text_col] are joined with single spaces and returned as a
    one-element list of chat messages, which makes it possible to measure the
    number of tokens in that column with a message-based token counter.
    '''
    joined_text   = ' '.join(df_[text_col].tolist())
    dummy_message = {'role': 'user', 'content': joined_text}
    return [dummy_message]


# Number of training essays to draw per category in one iteration, i.e. when
# building the set of examples for one data point in the dev set.
# The counts were determined experimentally based on the ChatGPT context
# window size and are treated as constants.
to_sample = {
    'Sadness': 8, 'Neutral': 6, 'Anger': 4, 'Disgust': 4, 'Fear': 4, 'Hope': 4, 'Surprise': 3, 'Joy': 3,
}

In [5]:
# Target variables: category name -> integer key (categories in alphabetical order)
label2key = {
    'Anger':    0,
    'Disgust':  1,
    'Fear':     2,
    'Hope':     3,
    'Joy':      4,
    'Neutral':  5,
    'Sadness':  6,
    'Surprise': 7,
}
# Inverse mapping: integer key -> category name
key2label = {v: k for k,v in label2key.items()}
print(key2label)

{0: 'Anger', 1: 'Disgust', 2: 'Fear', 3: 'Hope', 4: 'Joy', 5: 'Neutral', 6: 'Sadness', 7: 'Surprise'}


In [6]:
# API credentials and model used for all ChatGPT requests
openai.api_key = os.getenv("OPENAI_API_KEY2")
model          = 'gpt-3.5-turbo'

# Set of valid category names, used to validate ChatGPT responses
labels_set     = set(label2key)

# Matches every run of characters that are not ASCII letters or spaces
clean = re.compile(r'[^a-zA-Z ]+')
# Matches runs of two or more whitespace characters.
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
multi_spaces = re.compile(r'\s{2,}')
print(labels_set)

{'Fear', 'Joy', 'Neutral', 'Sadness', 'Disgust', 'Surprise', 'Anger', 'Hope'}


In [7]:
def get_target(emotions: List[str])->List[int]:
    '''
        Encode a list of category names as a multi-hot list of 0s and 1s.
        The result has length 8 (one slot per category); a 1 in the i-th slot
        means the essay belongs to the category key2label[i].
        A name missing from label2key raises KeyError.
    '''
    active_keys = {label2key[emotion] for emotion in emotions}
    return [1 if position in active_keys else 0 for position in range(8)]

In [8]:
def verify_label(label_):
    '''
       Extract the predefined categories mentioned in a response text.
       Returns the distinct categories found, sorted alphabetically and
       joined with a forward slash, or None if no category is present.
    '''
    # keep letters and spaces only, then break the text into words
    letters_only = clean.sub(' ', label_)
    words        = multi_spaces.sub(' ', letters_only).split()

    found = {word for word in words if word in labels_set}
    if not found:
        return None
    return '/'.join(sorted(found))

In [9]:
def verify_num_tokens(model, messages, max_prompt_tokens=3500):
    '''Check that enough tokens remain available for a ChatGPT response.

    model:             model name passed to num_tokens_from_messages()
    messages:          list of chat messages that form the prompt
    max_prompt_tokens: largest prompt size (in tokens) that is still accepted

    Returns True if the prompt has at most max_prompt_tokens tokens; otherwise
    prints the token count together with the limit and returns False.
    '''
    num_tokens_tiktoken = num_tokens_from_messages(messages, model)
    if num_tokens_tiktoken > max_prompt_tokens:
        # report the limit that was actually applied
        print(f'Number of tokens is {num_tokens_tiktoken} which exceeds {max_prompt_tokens}')
        return False
    return True


@backoff.on_exception(backoff.expo, openai.error.RateLimitError, max_time=10)
def get_response(model, messages, temperature=0, max_tokens=None):
    '''Send a chat completion request and return the text of the response.

    model:       name of the chat model
    messages:    list of chat messages (dicts with "role" and "content")
    temperature: range(0,2), the more the less deterministic / focused
    max_tokens:  maximum number of tokens to return; when None the parameter
                 is not sent at all

    Returns the stripped message content of the first completion.
    The call is retried with exponential backoff on openai.error.RateLimitError
    (backoff is configured with max_time=10); other errors propagate.
    '''
    request = dict(
        model = model,
        messages = messages,
        temperature = temperature,
        top_p = 1,                        # top probability mass, e.g. 0.1 = only tokens from top 10% proba mass
        n = 1,                            # number of chat completions
        stream = False,
        stop = None,                      # sequence to stop generation (new line, end of text, etc.)
    )
    # only forward max_tokens when the caller set it, so the default call is unchanged
    if max_tokens is not None:
        request['max_tokens'] = max_tokens

    response = openai.ChatCompletion.create(**request)
    return response['choices'][0]['message']['content'].strip()

In [10]:
# Instructions placed at the top of every few-shot prompt. A trailing backslash
# inside the string joins the line with the next one, so each numbered
# instruction ends up on a single line. The prompt ends with the '####'
# separator.
prompt_one = """
1 - Below you are given examples of essays with categories separated by four hashtags.
2 - Each essay has one or two relevant categories from the following list: \
Sadness, Neutral, Anger, Disgust, Fear, Hope, Surprise, Joy.
3 - Your task is to carefully learn from the examples of essays with categories in order to understand \
what features or words in the essays make them belong to a specific category and then use \
this knowledge to assign the correct relevant category from the above list to the very last essay.
4 - You may add a second category from the above list ONLY AND ONLY IF it is also relevant \
to the very last essay.
5 - Output just the category or categories for the last essay and nothing else. If there are two relevant \
categories: sort them alphabetically, concatenate with a forward slash, and output only them and nothing else.

####
"""

print(prompt_one)

# Using follow-up questions improves the response, but ChatGPT can sometimes change its mind too easily
followup = """Are you sure about that? If yes - repeat the same output, if no - change the category, \
but make sure it's from the list of predefined categories"""
print(followup)


1 - Below you are given examples of essays with categories separated by four hashtags.
2 - Each essay has one or two relevant categories from the following list: Sadness, Neutral, Anger, Disgust, Fear, Hope, Surprise, Joy.
3 - Your task is to carefully learn from the examples of essays with categories in order to understand what features or words in the essays make them belong to a specific category and then use this knowledge to assign the correct relevant category from the above list to the very last essay.
4 - You may add a second category from the above list ONLY AND ONLY IF it is also relevant to the very last essay.
5 - Output just the category or categories for the last essay and nothing else. If there are two relevant categories: sort them alphabetically, concatenate with a forward slash, and output only them and nothing else.

####

Are you sure about that? If yes - repeat the same output, if no - change the category, but make sure it's from the list of predefined categories


In [11]:
# Fixed seed, passed as random_state to DataFrame.sample() when shuffling the sampled examples
random_state = 47

# Load data

In [12]:
# Load the pickled train and dev dataframes.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
file1    = 'data/df_train.pkl'
df_train = pd.read_pickle(file1)

file2    = 'data/df_dev.pkl'
df_dev   = pd.read_pickle(file2)

# (rows, columns) of each dataframe
print(df_train.shape, df_dev.shape)

2023-05-08 14:57:22.615269: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


(792, 33) (208, 32)


In [18]:
# Show the first 25 rows of the dev set (essay text and emotion columns);
# max_colwidth=None disables truncation of the essay text
cols = [ 'essay_clean_spellchecked', 'emotion', ]
with pd.option_context('display.max_colwidth', None):
    display(df_dev[cols].head(25))

Unnamed: 0,essay_clean_spellchecked,emotion
0,"How sad is it that this kind of pain and suffering, and those kind of living conditions still exist today? what a gap we have in society between developed countries and those that aren't. It's crazy to drive around the US and see all the money people spend on pointless things, and then to think about how the people in Haiti are living.",[Sadness]
1,The article is kind of tragic and hits close to home as I am the son of Haitian immigrants. Haiti has a lot of problems that only become exaggerated during natural disasters. I think what the Haitian people really need from the international community is help developing infrastructure so they can address these issues themselves. Foreign aid only acts as a band aid.,[Sadness]
2,"I think that these kinds of stories, are sad, yet inspirational and leave you with kind of a good feeling. Even though his story is sad, it's cool and inspiring/motivational to see that he rose up against his circumstances. That he worked hard to make something of himself and he succeeded in what he wanted to do.",[Sadness]
3,It's crazy that random accidents like this happen everyday. I am not a baseball fan but of course enjoy a baseball game every now and again. I lived and worked in Miami too so I am vaguely familiar with that baseball player who unfortunately passed away. The effort to save him was great but unfortunately bad things seem to happen every day. He was so young too so it makes it worse.,[Neutral]
4,"This story makes me so so sad.... As someone who also grew up in the system, I can strongly relate. It's sad that America has not figured out a better and more safe system to handle kid's without parents or with parents who are unfit. A lot of the times, the system is no better, or even worse than the situation kids were in before, and I think this story is a good example of that.",[Sadness]
5,"After reading the article, my first reaction and feeling is that i feel really bad for the brothers. I feel like people their age should not have to be locked inside a jail cell. They should be out in the world improving themselves and being normal people. It's also really sad for the family members of these brothers as well because they are probably all suffering and worrying.",[Sadness]
6,"I didn't know coal mining had such adverse effects on the surrounding environment. It has basically ruined the lives of the people who live nearby these mines. And the animal populations too, imagine a heard of elephants not able to sustain themselves with the food available and needing to invade human territory...They must really be in a desperate situation.",[Neutral]
7,"This is very sad. I can't imagine having elephants come stampede my house in the middle of the night. What a terrible and sad situation, and these poor people can't even do anything about it. Someone needs to stop the deforestation and stop polluting the air these people breathe, it is not right that they are doing and all for the sake of turning a profit.",[Sadness]
8,"Guys, reading this article really hits home for me. If you or someone you know is having suicidal thoughts, please get help from the available sources. Suicide is no joke and it is a shame when someone does not get the help they need. I've struggled with this for a few years now but I got the help I needed. This woman was not as fortunate.",[Sadness]
9,Hey guys. So I just read this article about Iraqi Christians being persecuted by Muslims in Iraq. I don't understand why people of different religious backgrounds can't get along there. I'm sure it is a cultural thing but it is such unnecessary violence and conflict. It hurts both sides and I wish there was a way we could get them to set aside their differences. But not military action. We don't need another war.,[Neutral]


# Few-Shot Approach 1: concatenate random examples into one long string

## 1a. Divide the training set into 208 random chunks
This is done in a reproducible way - the same random_states are used (exact same results are confirmed in multiple runs)

Options:
1. Keep double categories when concatenating examples
2. One essay with one category per example (using explode()) before concatenating examples

In [15]:
# (Re)set the API key and the model name used for the requests in this section
openai.api_key = os.getenv("OPENAI_API_KEY2")
model          = 'gpt-3.5-turbo'

In [67]:
# Build one dataframe of few-shot examples per data point in the dev set.
num_iter       = df_dev.shape[0]
max_tokens     = 3200                          # token budget for the essays of one example set
text_col       = 'essay_clean_spellchecked'
emotion_col    = 'emotion_no_2nd_neut'
essays_sampled = []                            # essays already used in earlier example sets
example_dfs    = []                            # result: one dataframe of examples per iteration


def _count_essay_tokens(df_):
    '''Number of tokens of all essays in df_ joined into one dummy message'''
    return num_tokens_from_messages( get_dummy_messages(df_, text_col),
                                     model="gpt-3.5-turbo-0301",
                                   )


# One row per (essay, category) pair, split by category. df_train is not
# modified in this cell, so these frames are built once instead of being
# re-created for every category in every iteration.
df_train_exploded = df_train.explode(emotion_col)
rows_by_emotion   = {
    emo: df_train_exploded[ df_train_exploded[emotion_col] == emo ]
    for emo in to_sample
}

for i in range(num_iter):
    res = []
    for emo, num in to_sample.items():
        # prefer essays that were not used in any earlier example set
        temp = rows_by_emotion[emo]
        temp = temp[ ~temp[text_col].isin(essays_sampled) ]

        # if not enough data to sample, sample from the entire dataframe
        if temp.shape[0] < num:
            temp = rows_by_emotion[emo]

        # random_state=i keeps the sampling reproducible across runs
        essays     = temp.sample(n=num, random_state=i)[text_col].tolist()
        df_sampled = df_train[ df_train[text_col].isin(essays) ][[text_col, emotion_col]]
        res.append( df_sampled )
        essays_sampled.extend( essays )

    df_sampled_combined = pd.concat(res).sample(frac=1, random_state=random_state)
    # explode() helps avoid double categories in examples
    df_sampled_combined = df_sampled_combined.explode( emotion_col ).sample(frac=1, random_state=random_state)
    df_sampled_combined = df_sampled_combined.drop_duplicates()

    # reduce size to fit into the context window: shuffle and drop the last row
    # until the essays fit into the token budget
    # (an earlier variant dropped only 'Sadness' / 'Neutral' rows; it worked worse after explode())
    num_tokens = _count_essay_tokens(df_sampled_combined)
    while num_tokens > max_tokens:
        df_sampled_combined = df_sampled_combined.sample(frac=1, random_state=i)
        size = df_sampled_combined.shape[0]
        df_sampled_combined = df_sampled_combined.head(size-1)
        num_tokens = _count_essay_tokens(df_sampled_combined)

    example_dfs.append( df_sampled_combined )

    print('Size of combined df:', df_sampled_combined.shape )
    print('Number of tokens:', num_tokens)
    print('Value counts:\n', df_sampled_combined.explode(emotion_col)[emotion_col].value_counts(),sep='')
    print('Index:', sorted(df_sampled_combined.index.tolist()))
    print('\n', '='*75, '\n', sep='')

Size of combined df: (34, 2)
Number of tokens: 3157
Value counts:
Sadness     12
Disgust      4
Fear         4
Hope         4
Anger        3
Neutral      3
Joy          2
Surprise     2
Name: emotion_no_2nd_neut, dtype: int64
Index: [32, 35, 38, 90, 107, 111, 171, 216, 225, 238, 246, 246, 295, 295, 336, 342, 371, 371, 373, 383, 400, 426, 430, 478, 494, 561, 574, 593, 593, 617, 617, 620, 687, 729]


Size of combined df: (36, 2)
Number of tokens: 3110
Value counts:
Sadness     7
Disgust     6
Anger       6
Neutral     5
Fear        4
Joy         3
Surprise    3
Hope        2
Name: emotion_no_2nd_neut, dtype: int64
Index: [29, 54, 93, 157, 157, 202, 298, 347, 375, 415, 420, 424, 460, 466, 509, 522, 532, 567, 567, 575, 612, 626, 631, 650, 650, 662, 662, 664, 664, 681, 682, 703, 734, 739, 751, 751]


Size of combined df: (37, 2)
Number of tokens: 3194
Value counts:
Sadness     10
Anger        7
Neutral      6
Disgust      4
Hope         3
Fear         3
Joy          2
Surprise     2
Name: e

Size of combined df: (34, 2)
Number of tokens: 3196
Value counts:
Sadness     10
Disgust      5
Anger        4
Fear         4
Joy          3
Surprise     3
Neutral      3
Hope         2
Name: emotion_no_2nd_neut, dtype: int64
Index: [14, 29, 37, 38, 50, 94, 102, 102, 111, 151, 181, 181, 215, 269, 375, 375, 388, 416, 438, 467, 504, 553, 561, 561, 593, 593, 608, 619, 619, 627, 703, 740, 786, 791]


Size of combined df: (37, 2)
Number of tokens: 3170
Value counts:
Sadness     11
Anger        8
Hope         4
Neutral      4
Disgust      3
Joy          3
Fear         3
Surprise     1
Name: emotion_no_2nd_neut, dtype: int64
Index: [0, 11, 11, 29, 35, 63, 80, 102, 162, 172, 271, 296, 348, 374, 386, 391, 431, 438, 453, 453, 510, 531, 536, 536, 538, 538, 540, 574, 582, 593, 665, 684, 694, 694, 703, 703, 745]


Size of combined df: (34, 2)
Number of tokens: 3151
Value counts:
Sadness     11
Anger        6
Hope         4
Fear         4
Neutral      3
Surprise     3
Joy          3
Name: emotion_no

Size of combined df: (36, 2)
Number of tokens: 3148
Value counts:
Sadness     13
Disgust      5
Neutral      4
Anger        4
Fear         4
Joy          3
Hope         2
Surprise     1
Name: emotion_no_2nd_neut, dtype: int64
Index: [32, 37, 107, 107, 135, 146, 149, 157, 157, 253, 253, 263, 295, 295, 319, 325, 344, 377, 388, 491, 524, 527, 546, 561, 561, 593, 593, 643, 655, 657, 723, 734, 751, 751, 754, 783]


Size of combined df: (37, 2)
Number of tokens: 3143
Value counts:
Sadness     13
Anger        6
Neutral      5
Fear         4
Disgust      3
Hope         3
Joy          2
Surprise     1
Name: emotion_no_2nd_neut, dtype: int64
Index: [47, 80, 80, 109, 180, 180, 216, 216, 225, 264, 285, 300, 331, 354, 355, 355, 371, 371, 373, 398, 402, 436, 436, 450, 525, 620, 643, 650, 650, 652, 662, 674, 689, 763, 783, 786, 790]


Size of combined df: (38, 2)
Number of tokens: 3158
Value counts:
Sadness     12
Hope         5
Anger        5
Disgust      5
Neutral      4
Surprise     3
Joy         

Size of combined df: (35, 2)
Number of tokens: 3142
Value counts:
Sadness     12
Disgust      6
Neutral      5
Anger        4
Fear         3
Joy          2
Surprise     2
Hope         1
Name: emotion_no_2nd_neut, dtype: int64
Index: [7, 15, 35, 95, 140, 156, 187, 200, 233, 295, 313, 316, 392, 410, 500, 506, 556, 556, 591, 620, 621, 621, 655, 692, 710, 718, 721, 734, 734, 736, 743, 743, 746, 751, 751]


Size of combined df: (33, 2)
Number of tokens: 3188
Value counts:
Sadness     10
Neutral      5
Disgust      4
Anger        4
Joy          3
Fear         3
Hope         2
Surprise     2
Name: emotion_no_2nd_neut, dtype: int64
Index: [15, 68, 68, 147, 155, 203, 210, 230, 244, 295, 310, 321, 321, 321, 355, 355, 385, 385, 438, 483, 561, 561, 574, 579, 592, 650, 725, 725, 739, 740, 763, 769, 790]


Size of combined df: (39, 2)
Number of tokens: 3181
Value counts:
Sadness     15
Anger        7
Neutral      4
Disgust      4
Joy          3
Surprise     2
Fear         2
Hope         2
Name: emot

Size of combined df: (32, 2)
Number of tokens: 3155
Value counts:
Sadness     8
Anger       6
Disgust     4
Surprise    3
Neutral     3
Hope        3
Joy         3
Fear        2
Name: emotion_no_2nd_neut, dtype: int64
Index: [147, 157, 157, 169, 207, 279, 295, 295, 300, 321, 321, 321, 348, 396, 402, 436, 522, 525, 538, 538, 609, 617, 617, 643, 705, 705, 706, 714, 714, 715, 766, 776]


Size of combined df: (40, 2)
Number of tokens: 3147
Value counts:
Sadness     10
Neutral      6
Anger        6
Hope         5
Disgust      5
Fear         3
Joy          3
Surprise     2
Name: emotion_no_2nd_neut, dtype: int64
Index: [29, 32, 32, 35, 62, 74, 95, 107, 111, 115, 200, 200, 207, 258, 269, 274, 279, 279, 290, 320, 338, 338, 343, 343, 393, 397, 493, 574, 576, 592, 593, 593, 658, 681, 706, 715, 725, 754, 754, 765]


Size of combined df: (32, 2)
Number of tokens: 3099
Value counts:
Sadness     11
Neutral      5
Anger        4
Disgust      3
Fear         3
Joy          3
Hope         2
Surprise    

Size of combined df: (33, 2)
Number of tokens: 3184
Value counts:
Sadness     9
Disgust     5
Anger       4
Hope        4
Neutral     4
Joy         3
Fear        3
Surprise    1
Name: emotion_no_2nd_neut, dtype: int64
Index: [73, 82, 139, 181, 181, 189, 189, 225, 240, 240, 252, 252, 278, 295, 295, 382, 402, 438, 479, 479, 486, 573, 617, 617, 619, 619, 621, 621, 634, 653, 683, 694, 748]


Size of combined df: (38, 2)
Number of tokens: 3160
Value counts:
Sadness     13
Neutral      6
Anger        5
Disgust      5
Surprise     3
Hope         2
Joy          2
Fear         2
Name: emotion_no_2nd_neut, dtype: int64
Index: [10, 10, 29, 103, 103, 156, 295, 301, 379, 432, 436, 436, 437, 489, 536, 561, 565, 571, 633, 648, 648, 664, 664, 666, 679, 693, 699, 702, 703, 734, 734, 746, 751, 751, 755, 773, 774, 778]


Size of combined df: (33, 2)
Number of tokens: 3176
Value counts:
Sadness     10
Anger        5
Neutral      5
Fear         5
Hope         3
Surprise     2
Disgust      2
Joy          1


Size of combined df: (35, 2)
Number of tokens: 3128
Value counts:
Sadness     11
Anger        5
Disgust      5
Hope         4
Fear         3
Joy          3
Neutral      2
Surprise     2
Name: emotion_no_2nd_neut, dtype: int64
Index: [21, 34, 54, 127, 128, 152, 177, 189, 189, 198, 256, 278, 299, 346, 352, 355, 355, 375, 375, 386, 438, 458, 472, 504, 527, 593, 617, 617, 664, 664, 694, 694, 726, 731, 751]


Size of combined df: (37, 2)
Number of tokens: 3178
Value counts:
Sadness     11
Fear         5
Anger        5
Disgust      4
Surprise     3
Joy          3
Neutral      3
Hope         3
Name: emotion_no_2nd_neut, dtype: int64
Index: [9, 9, 21, 40, 164, 187, 244, 245, 273, 298, 298, 338, 338, 373, 373, 388, 402, 436, 436, 438, 439, 466, 478, 479, 479, 503, 536, 556, 556, 631, 631, 678, 705, 725, 725, 736, 737]


Size of combined df: (33, 2)
Number of tokens: 3135
Value counts:
Sadness     8
Neutral     6
Fear        5
Hope        4
Joy         3
Anger       3
Disgust     3
Surprise    1

Size of combined df: (36, 2)
Number of tokens: 3083
Value counts:
Sadness     10
Disgust      7
Anger        5
Hope         4
Neutral      4
Surprise     3
Fear         2
Joy          1
Name: emotion_no_2nd_neut, dtype: int64
Index: [80, 94, 100, 121, 121, 155, 181, 181, 189, 200, 200, 286, 288, 294, 295, 323, 338, 346, 456, 500, 525, 561, 561, 567, 567, 578, 590, 604, 649, 653, 725, 725, 746, 753, 755, 770]


Size of combined df: (34, 2)
Number of tokens: 3159
Value counts:
Sadness     9
Anger       8
Disgust     5
Fear        3
Joy         3
Neutral     3
Surprise    2
Hope        1
Name: emotion_no_2nd_neut, dtype: int64
Index: [34, 103, 107, 116, 118, 142, 157, 157, 162, 163, 187, 211, 249, 295, 295, 355, 355, 476, 478, 478, 480, 514, 514, 556, 556, 567, 569, 569, 595, 595, 599, 746, 751, 751]


Size of combined df: (37, 2)
Number of tokens: 3153
Value counts:
Sadness     13
Neutral      5
Fear         5
Anger        4
Surprise     4
Disgust      2
Joy          2
Hope         2
Nam

Size of combined df: (34, 2)
Number of tokens: 3181
Value counts:
Sadness     9
Neutral     6
Anger       5
Disgust     4
Fear        3
Surprise    3
Joy         2
Hope        2
Name: emotion_no_2nd_neut, dtype: int64
Index: [21, 53, 169, 225, 233, 239, 260, 266, 285, 295, 309, 315, 315, 321, 321, 346, 361, 362, 403, 415, 436, 461, 522, 538, 538, 608, 621, 634, 654, 751, 751, 755, 778, 778]


Size of combined df: (36, 2)
Number of tokens: 3181
Value counts:
Sadness     9
Anger       7
Disgust     4
Neutral     4
Joy         3
Fear        3
Surprise    3
Hope        3
Name: emotion_no_2nd_neut, dtype: int64
Index: [110, 117, 144, 157, 157, 163, 209, 226, 270, 276, 284, 315, 315, 355, 355, 416, 421, 441, 477, 477, 479, 479, 549, 598, 619, 619, 662, 662, 681, 690, 703, 703, 716, 717, 739, 774]


Size of combined df: (33, 2)
Number of tokens: 3176
Value counts:
Sadness     12
Hope         4
Fear         4
Neutral      4
Disgust      3
Joy          2
Anger        2
Surprise     2
Name: emot

Size of combined df: (37, 2)
Number of tokens: 3154
Value counts:
Sadness     12
Disgust      6
Surprise     4
Anger        4
Hope         4
Neutral      4
Joy          2
Fear         1
Name: emotion_no_2nd_neut, dtype: int64
Index: [14, 31, 65, 94, 102, 167, 181, 188, 206, 216, 216, 279, 279, 290, 295, 321, 321, 347, 350, 355, 371, 406, 436, 453, 453, 461, 494, 505, 536, 536, 542, 542, 618, 671, 702, 712, 722]


Size of combined df: (36, 2)
Number of tokens: 3145
Value counts:
Sadness     11
Disgust      6
Anger        6
Joy          3
Hope         3
Neutral      3
Fear         2
Surprise     2
Name: emotion_no_2nd_neut, dtype: int64
Index: [27, 47, 82, 147, 157, 157, 254, 315, 315, 321, 321, 321, 343, 355, 355, 382, 402, 411, 416, 453, 514, 514, 532, 619, 662, 662, 679, 688, 714, 714, 721, 725, 725, 751, 751, 769]


Size of combined df: (38, 2)
Number of tokens: 3142
Value counts:
Sadness     10
Neutral      5
Anger        5
Fear         5
Hope         5
Disgust      4
Surprise     2

Size of combined df: (34, 2)
Number of tokens: 3184
Value counts:
Sadness     8
Anger       6
Disgust     5
Fear        4
Neutral     4
Hope        3
Joy         2
Surprise    2
Name: emotion_no_2nd_neut, dtype: int64
Index: [6, 11, 11, 21, 157, 157, 181, 181, 185, 189, 207, 244, 276, 276, 296, 391, 426, 449, 511, 574, 593, 593, 599, 605, 631, 631, 633, 690, 727, 748, 748, 756, 773, 790]


Size of combined df: (36, 2)
Number of tokens: 3185
Value counts:
Sadness     11
Neutral      6
Disgust      5
Anger        4
Fear         3
Joy          3
Hope         3
Surprise     1
Name: emotion_no_2nd_neut, dtype: int64
Index: [0, 0, 32, 32, 35, 107, 154, 189, 324, 344, 351, 382, 402, 449, 449, 464, 469, 479, 497, 514, 514, 525, 527, 527, 599, 602, 628, 629, 638, 665, 681, 692, 731, 750, 763, 765]


Size of combined df: (31, 2)
Number of tokens: 3091
Value counts:
Sadness     11
Neutral      6
Joy          3
Hope         3
Surprise     2
Anger        2
Disgust      2
Fear         2
Name: emotio

Size of combined df: (34, 2)
Number of tokens: 3166
Value counts:
Sadness     13
Neutral      6
Disgust      5
Surprise     3
Anger        3
Hope         2
Fear         2
Name: emotion_no_2nd_neut, dtype: int64
Index: [32, 32, 78, 112, 156, 157, 226, 240, 295, 313, 313, 314, 340, 373, 373, 384, 395, 405, 465, 483, 494, 561, 582, 599, 599, 631, 631, 710, 731, 751, 751, 765, 773, 773]


Size of combined df: (35, 2)
Number of tokens: 3163
Value counts:
Sadness     12
Anger        5
Neutral      5
Disgust      4
Surprise     3
Joy          2
Fear         2
Hope         2
Name: emotion_no_2nd_neut, dtype: int64
Index: [9, 9, 12, 21, 64, 189, 189, 216, 292, 302, 342, 346, 377, 409, 449, 484, 496, 553, 556, 560, 565, 589, 589, 599, 599, 602, 662, 662, 681, 706, 714, 769, 778, 778, 788]


Size of combined df: (37, 2)
Number of tokens: 3186
Value counts:
Sadness     9
Fear        5
Hope        5
Anger       4
Neutral     4
Disgust     4
Surprise    3
Joy         3
Name: emotion_no_2nd_neut, dty

In [69]:
# Smallest number of examples? Distribution of the number of rows per example set.
from collections import Counter
lengths = [df_.shape[0] for df_ in example_dfs]
print('Mean length:', np.mean(lengths))
# (number of rows, how many example sets have that many rows), most frequent first
c = Counter(lengths)
c.most_common()

Mean length: 35.78846153846154


[(36, 44),
 (38, 33),
 (34, 32),
 (35, 28),
 (37, 27),
 (33, 14),
 (39, 13),
 (32, 8),
 (40, 4),
 (31, 4),
 (30, 1)]

This means that using 34-35 essays per set of examples keeps the total number of tokens for that set within the context window size of 4096 tokens.

In [102]:
# Build one few-shot prompt per example set: the instructions followed by all
# (essay, category) pairs of that set, each pair terminated by '####'.
examples  = []
for df_ in example_dfs:
    shots   = [
        f"\nEssay: {text}\n\nCategory: {emo}.\n\n####\n"
        for text, emo in df_[[text_col, emotion_col]].values
    ]
    example = prompt_one.strip() + '\n' + ''.join(shots)
    examples.append(example)

# Number of tokens of every prompt, then the ones below 3400 and above 3800
lengths = [ num_tokens_from_messages([{'role':'user', 'content': example} ]) for example in examples ]
print(lengths)
print([n_tok for n_tok in lengths if n_tok <3400])
print([n_tok for n_tok in lengths if n_tok > 3800])

[3630, 3603, 3692, 3565, 3547, 3638, 3631, 3677, 3675, 3625, 3702, 3676, 3621, 3686, 3658, 3672, 3653, 3626, 3626, 3665, 3669, 3668, 3617, 3645, 3647, 3563, 3636, 3705, 3687, 3639, 3680, 3613, 3697, 3624, 3614, 3669, 3625, 3664, 3696, 3598, 3643, 3641, 3667, 3716, 3660, 3681, 3645, 3672, 3588, 3674, 3641, 3649, 3647, 3535, 3645, 3631, 3655, 3641, 3665, 3626, 3652, 3700, 3709, 3653, 3638, 3655, 3637, 3657, 3613, 3628, 3569, 3535, 3660, 3652, 3658, 3602, 3611, 3670, 3552, 3671, 3590, 3624, 3616, 3656, 3714, 3633, 3622, 3644, 3678, 3628, 3656, 3669, 3686, 3595, 3647, 3671, 3635, 3628, 3629, 3632, 3668, 3607, 3701, 3689, 3562, 3697, 3701, 3595, 3545, 3701, 3675, 3609, 3680, 3593, 3612, 3702, 3700, 3619, 3669, 3559, 3612, 3649, 3580, 3590, 3585, 3690, 3570, 3627, 3577, 3635, 3650, 3628, 3683, 3550, 3648, 3621, 3669, 3496, 3531, 3575, 3688, 3693, 3669, 3690, 3586, 3652, 3669, 3640, 3562, 3654, 3643, 3632, 3666, 3628, 3616, 3694, 3608, 3656, 3672, 3722, 3621, 3629, 3591, 3690, 3656, 3639, 364

In [103]:
# Inspect the first assembled few-shot prompt
print(examples[0])

1 - Below you are given examples of essays with categories separated by four hashtags.
2 - Each essay has one or two relevant categories from the following list: Sadness, Neutral, Anger, Disgust, Fear, Hope, Surprise, Joy.
3 - Your task is to carefully learn from the examples of essays with categories in order to understand what features or words in the essays make them belong to a specific category and then use this knowledge to assign the correct relevant category from the above list to the very last essay.
4 - You may add a second category from the above list ONLY AND ONLY IF it is also relevant to the very last essay.
5 - Output just the category or categories for the last essay and nothing else. If there are two relevant categories: sort them alphabetically, concatenate with a forward slash, and output only them and nothing else.

####

Essay: I feel like overall it is quite sad that this happened and it should have been prevented by the proper authorities. I feel like as a people

In [104]:
def classify_text_few_shot(prompt_):
    '''Send prompt_ to the chat model and return the category for its last essay.

    prompt_ is sent as a single user message (no system message is used and
    the token-count check is disabled). When verify_label() returns None for
    the reply, the reply plus the global `followup` question are appended to
    the same conversation and the model is queried one more time.

    Returns the value produced by verify_label(), or - if it is still None
    after the second attempt - the raw text of the most recent reply.
    '''
    conversation = [{"role": "user", "content": prompt_}]

    # first attempt: one-turn chat
    raw_reply = get_response(model, conversation)
    parsed    = verify_label(raw_reply)        # presumably strips the reply down to the bare category
    print('First iteration:', raw_reply, parsed)
    if parsed is not None:
        return parsed

    # no label recognized - continue the same chat with the followup question
    conversation.extend([
        {"role": "assistant", "content": raw_reply},
        {"role": "user", "content": followup},
    ])
    raw_reply = get_response(model, conversation)
    parsed    = verify_label(raw_reply)
    print('\tSecond iteration:', raw_reply, parsed)

    # fall back to the raw reply so it can be reviewed manually later
    return raw_reply if parsed is None else parsed

In [105]:
# Classify every dev essay with a few-shot prompt built from one of the
# pre-concatenated example blocks in `examples`, cycling through them in order.
start  = time.time()
res    = dict()   # essay text -> value returned by classify_text_few_shot
count1 = 0        # number of essays attempted (successful or not)
count2 = 0        # index of the next block in `examples`; wraps around to 0
for t in df_dev[text_col].tolist():
    if t in res:
        continue
    if count2 >= len(examples):
        count2 = 0
    prompt = examples[ count2 ].strip() + f'\nEssay: {t}\n\nCategory:'
    count2 += 1
    try:
        res[ t ] = classify_text_few_shot(prompt)
    except openai.error.RateLimitError:
        print(f'\nText: {t}.\nRate limit error\n')
    except Exception as e:
        # best effort: report and move on; the essay stays absent from res
        print(f'\nText: {t}\nError: {e}\n')

    count1 += 1
    if count1 % 10 == 0:
        print(f'Processing text {count1}; example {count2-1}')
        # periodic checkpoint so a crash does not lose paid API results
        with open('data/res.pkl', 'wb') as f:
            pickle.dump(res, f, protocol=pickle.HIGHEST_PROTOCOL)

elapsed = (time.time() - start)/60

# final checkpoint: the periodic one only fires on multiples of 10, so without
# this the results of the last few essays would never be written to disk
with open('data/res.pkl', 'wb') as f:
    pickle.dump(res, f, protocol=pickle.HIGHEST_PROTOCOL)

print(f'\nTime elapsed {round(elapsed, 4)} min')

First iteration: Sadness. Sadness
First iteration: Hope. Hope
First iteration: Hope. Hope
First iteration: Sadness. Sadness
First iteration: Sadness. Sadness
First iteration: Sadness. Sadness
First iteration: Disgust/Sadness. Disgust/Sadness
First iteration: Sadness. Sadness
First iteration: Hope. Hope
First iteration: Anger. Anger
Processing text 10; example 9
First iteration: Fear. Fear
First iteration: Sadness. Sadness
First iteration: Anger. Anger
First iteration: Hope. Hope
First iteration: Sadness. Sadness
First iteration: Sadness. Sadness
First iteration: Sadness. Sadness
First iteration: Disgust. Disgust
First iteration: Sadness. Sadness
First iteration: Hope. Hope
Processing text 20; example 19
First iteration: Fear/Sadness. Fear/Sadness
First iteration: Sadness. Sadness
First iteration: Anger. Anger
First iteration: Sadness. Sadness
First iteration: Sadness. Sadness
First iteration: Disgust. Disgust
First iteration: Disgust. Disgust
First iteration: Sadness. Sadness
First ite

In [106]:
len(examples), len(res)

(208, 208)

### If a followup question was not used

In [107]:
# if one label is the output
df_dev['pred_all'] = df_dev[text_col].map( res )
print(df_dev.isna().sum())
print(df_dev['pred_all'].value_counts())

article_id                    0
conversation_id               0
speaker_number                0
essay_id                      0
speaker_id                    0
essay                         0
essay_clean                   0
split                         0
gender                        0
education                     0
race                          0
age                           0
income                        0
emotion                       0
emotion_count                 0
char_length                   0
word_length                   0
target_encoded                0
article                       0
article_clean                 0
essay_clean_docs              0
essay_clean_spellchecked      0
article_clean_docs            0
article_clean_spellchecked    0
compare1                      0
compare2                      0
gpt_embedding                 0
closest_texts                 0
emotion_no_2nd_neut           0
pred_all                      0
pred_encoded                  0
dtype: i

In [97]:
print(df_dev[ df_dev['pred_all']=="Yes, I am sure. The category is Concern." ])
df_dev.at[ 127, 'pred_all' ] = 'Fear'
df_dev.at[ 76, 'pred_all' ] = 'Fear'
df_dev.at[ 18, 'pred_all' ] = 'Sadness'
df_dev.at[ 48, 'pred_all' ] = 'Neutral'
df_dev.at[ 80, 'pred_all' ] = 'Neutral'
df_dev.at[ 115, 'pred_all' ] = 'Anger'
df_dev.at[ 74, 'pred_all' ] = 'Neutral'
df_dev.at[ 158, 'pred_all' ] = 'Neutral'

     article_id  conversation_id  speaker_number  essay_id  speaker_id  \
158          31              204               2       703          93   

                                                 essay  \
158  Hey man, Apparently, Indonesia is going throug...   

                                           essay_clean split  gender  \
158  Hey man, Apparently, Indonesia is going throug...   dev       1   

     education  race  age  income    emotion  emotion_count  char_length  \
158          7     5   41   64000  [Neutral]              1          467   

     word_length            target_encoded  \
158           75  [0, 0, 0, 0, 0, 1, 0, 0]   

                                               article  \
158  A Palm Oil Company Threatens The Third Largest...   

                                         article_clean  \
158  A Palm Oil Company Threatens The Third Largest...   

                                      essay_clean_docs  \
158  (Hey, man, ,, Apparently, ,, Indonesia, is, go

In [108]:
df_dev['pred_all'].value_counts()

Sadness            91
Anger              22
Disgust            22
Hope               21
Fear               18
Neutral            16
Fear/Sadness        5
Disgust/Sadness     3
Anger/Sadness       3
Hope/Sadness        2
Disgust/Fear        2
Disgust/Hope        1
Fear/Hope           1
Surprise            1
Name: pred_all, dtype: int64

In [109]:
# binarize predictions
df_dev['pred_encoded'] = df_dev['pred_all'].apply( lambda x: get_target(x.split('/')) )

In [110]:
y_dev_encoded      = np.array( df_dev['target_encoded'].values.tolist() )
y_dev_pred_encoded = np.array( df_dev['pred_encoded'].values.tolist() )
labels = list(label2key.keys())
print( classification_report( y_dev_encoded, y_dev_pred_encoded, target_names=labels, digits=4 ) )

              precision    recall  f1-score   support

       Anger     0.5600    0.3684    0.4444        38
     Disgust     0.5000    0.5833    0.5385        24
        Fear     0.2692    0.8750    0.4118         8
        Hope     0.2400    0.3750    0.2927        16
         Joy     0.0000    0.0000    0.0000         2
     Neutral     0.6875    0.2037    0.3143        54
     Sadness     0.7788    0.8020    0.7902       101
    Surprise     0.0000    0.0000    0.0000         3

   micro avg     0.5911    0.5407    0.5648       246
   macro avg     0.3794    0.4009    0.3490       246
weighted avg     0.6303    0.5407    0.5471       246
 samples avg     0.6058    0.5601    0.5681       246



  _warn_prf(average, modifier, msg_start, len(result))


### If a followup question was used

In [36]:
# if a followup question was used - clean response
df_dev['pred'] = df_dev['pred_all'].apply( lambda x: x[3] )
print('Null values:\n', df_dev.isna().sum(), sep='')
df_dev['pred'].value_counts()

Null values:
article_id                    0
conversation_id               0
speaker_number                0
essay_id                      0
speaker_id                    0
essay                         0
essay_clean                   0
split                         0
gender                        0
education                     0
race                          0
age                           0
income                        0
emotion                       0
emotion_count                 0
char_length                   0
word_length                   0
target_encoded                0
article                       0
article_clean                 0
essay_clean_docs              0
essay_clean_spellchecked      0
article_clean_docs            0
article_clean_spellchecked    0
compare1                      0
compare2                      0
pred_all                      0
pred_encoded                  0
pred                          3
dtype: int64


Sadness             54
Fear/Sadness        31
Hope/Sadness        24
Anger/Sadness       23
Disgust/Sadness     19
Anger/Disgust        9
Hope                 8
Fear                 6
Disgust              5
Neutral/Sadness      4
Anger/Fear           4
Hope/Neutral         4
Disgust/Fear         3
Anger                2
Anger/Neutral        2
Neutral              2
Fear/Neutral         1
Disgust/Neutral      1
Sadness/Surprise     1
Joy                  1
Fear/Surprise        1
Name: pred, dtype: int64

In [42]:
# if a followup question was used - review why some predictions were NaNs
temp = df_dev[ df_dev['pred'].isna() ]
print(temp.index)
temp[['pred_all', 'pred']].values.tolist()

Int64Index([150, 167, 173], dtype='int64')


[[('Neutral',
   'Neutral',
   'I think the text is more related to Disappointment.',
   None),
  None],
 [('Sadness',
   'Sadness',
   "Yes, I am sure. The text expresses concern about children not being able to eat and the country's failure to provide basic necessities to its people, which is a sad situation.",
   None),
  None],
 [('Hope/Sadness',
   'Hope/Sadness',
   "Yes, I'm sure. The text expresses sympathy for the situation but also a sense of helplessness, and ends with a hopeful attitude towards the future.",
   None),
  None]]

In [43]:
# if a followup question was used - manually assign missing predictions
df_dev.at[150, 'pred'] = 'Neutral'
df_dev.at[167, 'pred'] = 'Sadness'
df_dev.at[173, 'pred'] = 'Hope/Sadness'

In [61]:
# binarize predictions
df_dev['pred_encoded'] = df_dev['pred'].apply( lambda x: get_target(x.split('/')) )

In [None]:
y_dev_encoded      = np.array( df_dev['target_encoded'].values.tolist() )
y_dev_pred_encoded = np.array( df_dev['pred_encoded'].values.tolist() )
labels = list(label2key.keys())
print( classification_report( y_dev_encoded, y_dev_pred_encoded, target_names=labels, digits=4 ) )

# Few-Shot Approach 2: concatenate closest 30 examples into one long string

In [55]:
openai.api_key = os.getenv("OPENAI_API_KEY2")
model          = 'gpt-3.5-turbo'

In [30]:
# find top_n closest df_train embeddings for each df_dev embedding
def batch_cosine(embedding_, df, top_n=100):
    '''Return the 'essay_clean' texts of the top_n rows of df most similar to embedding_.

    Similarity is cosine_similarity between embedding_ and each row's
    'gpt_embedding'. Note that df is modified in place: a 'similarity'
    column is added (or overwritten) on every call.
    '''
    scores = df['gpt_embedding'].apply(lambda vec: cosine_similarity(vec, embedding_))
    df['similarity'] = scores
    ranked = df.sort_values(by='similarity', ascending=False)
    closest = ranked.head(top_n)
    return closest['essay_clean'].tolist()

# For every dev essay, collect the 30 training essays whose embeddings are
# closest to it. A copy of df_train is passed because batch_cosine adds a
# 'similarity' column to the frame it is given.
df_train_copy = df_train.copy()
start = time.time()
res   = dict()   # dev essay text -> list of closest training essay texts
count = 0
for t, e in df_dev[['essay_clean', 'gpt_embedding']].values:
    if t in res:
        continue
    res[ t ] = batch_cosine( e, df_train_copy, top_n=30, )
    count += 1
    if count % 10 == 0:
        print(f'Processing text {count}. Time elapsed: {round((time.time()-start)/60, 4)} min')
        # periodic checkpoint
        with open('data/res.pkl', 'wb') as f:
            pickle.dump(res, f, protocol=pickle.HIGHEST_PROTOCOL)

elapsed = (time.time() - start)/60

# final checkpoint: the periodic one only fires on multiples of 10, so without
# this the file on disk would miss the last few essays
with open('data/res.pkl', 'wb') as f:
    pickle.dump(res, f, protocol=pickle.HIGHEST_PROTOCOL)

print(f'\nTime elapsed {round(elapsed, 4)} min')

Processing text 10. Time elapsed: 0.0373 min
Processing text 20. Time elapsed: 0.074 min
Processing text 30. Time elapsed: 0.1105 min
Processing text 40. Time elapsed: 0.1474 min
Processing text 50. Time elapsed: 0.1841 min
Processing text 60. Time elapsed: 0.2211 min
Processing text 70. Time elapsed: 0.2579 min
Processing text 80. Time elapsed: 0.2952 min
Processing text 90. Time elapsed: 0.3326 min
Processing text 100. Time elapsed: 0.3696 min
Processing text 110. Time elapsed: 0.4074 min
Processing text 120. Time elapsed: 0.445 min
Processing text 130. Time elapsed: 0.4825 min
Processing text 140. Time elapsed: 0.5202 min
Processing text 150. Time elapsed: 0.5575 min
Processing text 160. Time elapsed: 0.595 min
Processing text 170. Time elapsed: 0.6326 min
Processing text 180. Time elapsed: 0.671 min
Processing text 190. Time elapsed: 0.709 min
Processing text 200. Time elapsed: 0.747 min

Time elapsed 0.7774 min


In [31]:
df_dev['closest_texts'] = df_dev['essay_clean'].map( res )
print(df_dev.isna().sum())

article_id                    0
conversation_id               0
speaker_number                0
essay_id                      0
speaker_id                    0
essay                         0
essay_clean                   0
split                         0
gender                        0
education                     0
race                          0
age                           0
income                        0
emotion                       0
emotion_count                 0
char_length                   0
word_length                   0
target_encoded                0
article                       0
article_clean                 0
essay_clean_docs              0
essay_clean_spellchecked      0
article_clean_docs            0
article_clean_spellchecked    0
compare1                      0
compare2                      0
gpt_embedding                 0
closest_texts                 0
dtype: int64


In [32]:
file = 'data/df_dev.pkl'
df_dev.to_pickle(file)

In [78]:
df_train.columns

Index(['article_id', 'conversation_id', 'speaker_number', 'essay_id',
       'speaker_id', 'essay', 'essay_clean', 'split', 'gender', 'education',
       'race', 'age', 'income', 'emotion', 'target_encoded', 'target_encoded2',
       'compare', 'emotion_count', 'char_length', 'word_length', 'article',
       'article_clean', 'essay_clean_docs', 'essay_clean_spellchecked',
       'article_clean_docs', 'article_clean_spellchecked', 'compare1',
       'compare2', 'gpt_embedding', 'emotion_no_2nd_neut'],
      dtype='object')

In [79]:
# concatenate 30 closest examples for each dev set datapoint and check the total number of tokens
def concatenate_few_show_examples( closest_texts ):
    '''Build a few-shot prompt from the training essays listed in closest_texts.

    Rows of df_train whose 'essay_clean' is in closest_texts are selected
    (in df_train row order). Each contributes its spellchecked text and its
    alphabetically sorted, slash-joined categories, followed by a "####"
    separator. The pieces are appended to the instruction header `prompt_one`.
    '''
    selected = df_train[ df_train['essay_clean'].isin(closest_texts) ]
    pairs = selected[['essay_clean_spellchecked', 'emotion_no_2nd_neut']].values
    shots = [
        f"\nText: {text}\n\nCategory: {'/'.join(sorted(emo))}.\n\n####\n"
        for text, emo in pairs
    ]
    return prompt_one + ''.join(shots)

df_dev['closest_texts_concatenated'] = df_dev['closest_texts'].apply( concatenate_few_show_examples )


# check that the number of tokens in the concatenated examples doesn't exceed ChatGPT's context window size of 4096
lengths = [ num_tokens_from_messages([{'role':'user', 'content': example} ]) for example\
            in df_dev['closest_texts_concatenated'].values ]
print(lengths)
print([l for l in lengths if l <3000])
print([l for l in lengths if l > 3500])

[3313, 3442, 3131, 2930, 3297, 2964, 3101, 3057, 3215, 3209, 3411, 3226, 3303, 2836, 3143, 2919, 3094, 3051, 3092, 3041, 3076, 3145, 3198, 2972, 3106, 3383, 3174, 3158, 3301, 3304, 3248, 3578, 3088, 3496, 3239, 3236, 3138, 3503, 3398, 3093, 2961, 3032, 3099, 3198, 2906, 2922, 2897, 3201, 3300, 3176, 3368, 2982, 3358, 2995, 2955, 3116, 3228, 3032, 3006, 3178, 3383, 3193, 3050, 3086, 3406, 3327, 3067, 3011, 3051, 3020, 3368, 3264, 3320, 3393, 3292, 3640, 3348, 3241, 3337, 3383, 3050, 3074, 3570, 3543, 3015, 3259, 3436, 3232, 3119, 3230, 3241, 3207, 3207, 3130, 3416, 3257, 3523, 3125, 3135, 3144, 3122, 3409, 3171, 3130, 3227, 3001, 3258, 2760, 3319, 3220, 3113, 3264, 3271, 3459, 3272, 2979, 3626, 3242, 3096, 3179, 3207, 3096, 2917, 3062, 3294, 3102, 3153, 2955, 2957, 3237, 3366, 3103, 3190, 3295, 3151, 3173, 3144, 3211, 3610, 3163, 3150, 3338, 3077, 3403, 3105, 3223, 3135, 2878, 2907, 3278, 3154, 3326, 3386, 3143, 3435, 3410, 3358, 3086, 3043, 2948, 3133, 3537, 3248, 3098, 3147, 3119, 333

In [80]:
print(df_dev['closest_texts_concatenated'].values[0])


1 - Below you are given examples of texts with their most relevant emotion categories. The examples are separated by four hashtags.
2 - Each text can belong to one or two most relevant emotion categories from the following list: Sadness, Neutral, Anger, Disgust, Fear, Hope, Surprise, Joy.
3 - Your task is to carefully learn from the examples of texts with their categories and then use the acquired knowledge to classify the very last text by selecting the most relevant emotion category from the above list.
4 - You may add a second emotion category from the above list ONLY AND ONLY IF it is also relevant to the very last text.
5 - Output just the category or categories for the last text and nothing else. If there are two relevant emotion categories: sort them alphabetically, concatenate with a forward slash, and output only them and nothing else.

####

Text: It breaks my heart to see people living in those conditions. I hope that all the aid that was sent to the island makes it to the 

In [81]:
# Classify every dev essay with a few-shot prompt built from its own
# concatenated closest training examples.
start = time.time()
res   = dict()   # essay text -> value returned by classify_text_few_shot
count = 0        # number of essays attempted (successful or not)
for text, example in df_dev[['essay_clean_spellchecked', 'closest_texts_concatenated']].values:
    if text in res:
        continue
    prompt = example + f'\nText: {text}\n\nCategory:'
    try:
        res[ text ] = classify_text_few_shot(prompt)
    except Exception as e:
        # best effort: report and move on; the essay stays absent from res
        print(f'\nText: {text}\nError: {e}\n')

    count += 1
    if count % 10 == 0:
        print(f'Processing text {count}')
        # periodic checkpoint so a crash does not lose paid API results
        with open('data/res.pkl', 'wb') as f:
            pickle.dump(res, f, protocol=pickle.HIGHEST_PROTOCOL)

elapsed = (time.time() - start)/60

# final checkpoint: the periodic one only fires on multiples of 10, so without
# this the results of the last few essays would never be written to disk
with open('data/res.pkl', 'wb') as f:
    pickle.dump(res, f, protocol=pickle.HIGHEST_PROTOCOL)

print(f'\nTime elapsed {round(elapsed, 4)} min')

Frist iteration: Sadness. Sadness
Frist iteration: Hope. Hope
Frist iteration: Hope/Joy. Hope/Joy
Frist iteration: Sadness. Sadness
Frist iteration: Sadness. Sadness
Frist iteration: Sadness. Sadness
Frist iteration: Sadness. Sadness
Frist iteration: Sadness. Sadness
Frist iteration: Hope/Sadness. Hope/Sadness
Frist iteration: Anger/Sadness. Anger/Sadness
Processing text 10
Frist iteration: Sadness. Sadness
Frist iteration: Sadness. Sadness
Frist iteration: Sadness. Sadness
Frist iteration: Hope/Sadness. Hope/Sadness
Frist iteration: Sadness. Sadness
Frist iteration: Sadness. Sadness
Frist iteration: Sadness. Sadness
Frist iteration: Disgust/Sadness. Disgust/Sadness
Frist iteration: Sadness/Neutral. Neutral/Sadness
Frist iteration: Hope. Hope
Processing text 20
Frist iteration: Sadness. Sadness
Frist iteration: Sadness. Sadness
Frist iteration: Sadness. Sadness
Frist iteration: Sadness. Sadness
Frist iteration: Hope/Sadness. Hope/Sadness
Frist iteration: Sadness. Sadness
Frist iteratio

### If a followup question was not used

In [82]:
# if one label is the output
df_dev['pred_all'] = df_dev['essay_clean_spellchecked'].map( res )
print(df_dev.isna().sum())
print(df_dev['pred_all'].value_counts())

article_id                    0
conversation_id               0
speaker_number                0
essay_id                      0
speaker_id                    0
essay                         0
essay_clean                   0
split                         0
gender                        0
education                     0
race                          0
age                           0
income                        0
emotion                       0
emotion_count                 0
char_length                   0
word_length                   0
target_encoded                0
article                       0
article_clean                 0
essay_clean_docs              0
essay_clean_spellchecked      0
article_clean_docs            0
article_clean_spellchecked    0
compare1                      0
compare2                      0
gpt_embedding                 0
closest_texts                 0
emotion_no_2nd_neut           0
closest_texts_concatenated    0
pred_all                      0
dtype: i

In [45]:
print(df_dev[ df_dev['pred_all']=="Yes, I'm sure. The category is Disappointment."])
df_dev.at[ 54, 'pred_all' ] = 'Fear'
df_dev.at[ 74, 'pred_all' ] = 'Neutral'
df_dev.at[ 26, 'pred_all' ] = 'Sadness'

    article_id  conversation_id  speaker_number  essay_id  speaker_id  \
26         233               93               1        92          71   

                                                essay  \
26  Unfortunately in countries like these the offi...   

                                          essay_clean split  gender  \
26  Unfortunately in countries like these the offi...   dev       1   

    education  race  age  income    emotion  emotion_count  char_length  \
26          6     2   32   35000  [Disgust]              1          369   

    word_length            target_encoded  \
26           72  [0, 1, 0, 0, 0, 0, 0, 0]   

                                              article  \
26  Nigeria investigates reports that officials ra...   

                                        article_clean  \
26  Nigeria investigates reports that officials ra...   

                                     essay_clean_docs  \
26  (Unfortunately, in, countries, like, these, th...   

        

In [83]:
df_dev['pred_all'].value_counts()

Sadness             95
Neutral             19
Hope/Sadness        15
Disgust             14
Anger/Sadness       12
Disgust/Sadness     12
Anger               10
Hope                 7
Fear                 5
Anger/Disgust        3
Neutral/Sadness      3
Anger/Hope           2
Disgust/Hope         2
Disgust/Fear         2
Fear/Sadness         2
Hope/Joy             1
Disgust/Surprise     1
Fear/Hope            1
Sadness/Surprise     1
Fear/Neutral         1
Name: pred_all, dtype: int64

In [84]:
# binarize predictions
df_dev['pred_encoded'] = df_dev['pred_all'].apply( lambda x: get_target(x.split('/')) )

In [85]:
y_dev_encoded      = np.array( df_dev['target_encoded'].values.tolist() )
y_dev_pred_encoded = np.array( df_dev['pred_encoded'].values.tolist() )
labels = list(label2key.keys())
print( classification_report( y_dev_encoded, y_dev_pred_encoded, target_names=labels, digits=4 ) )

              precision    recall  f1-score   support

       Anger     0.5926    0.4211    0.4923        38
     Disgust     0.4412    0.6250    0.5172        24
        Fear     0.3636    0.5000    0.4211         8
        Hope     0.3214    0.5625    0.4091        16
         Joy     0.0000    0.0000    0.0000         2
     Neutral     0.6957    0.2963    0.4156        54
     Sadness     0.6714    0.9307    0.7801       101
    Surprise     0.0000    0.0000    0.0000         3

   micro avg     0.5789    0.6260    0.6016       246
   macro avg     0.3857    0.4169    0.3794       246
weighted avg     0.5957    0.6260    0.5783       246
 samples avg     0.6178    0.6394    0.6106       246



# Get keywords, title, summary

In [19]:
openai.api_key = os.getenv("OPENAI_API_KEY")
model          = 'gpt-4'
text_col       = 'essay_clean_spellchecked'
emotion_col    = 'emotion_no_2nd_neut'

In [20]:
prompt2 = """Your task is to analyze the text delineated with triple backticks below and extract \
the important keywords and keyphrases. Return your output as one comma separated list.
Text: ```{}``` """
sample_text = 'This is a sample text'
print( prompt2.format(sample_text))

Your task is to analyze the text delineated with triple backticks below and extract the important keywords and keyphrases. Return your output as one comma separated list.
Text: ```This is a sample text``` 


In [21]:
prompt3 = """Your task is to analyze the text delineated with triple backticks below and provide \
a good meaningful title for it. Return just the title and nothing else.
Text: ```{}``` """
sample_text = 'This is a sample text'
print( prompt3.format(sample_text))

Your task is to analyze the text delineated with triple backticks below and provide a good meaningful title for it. Return just the title and nothing else.
Text: ```This is a sample text``` 


In [22]:
prompt4 = """Your task is to carefully analyze and summarize the text delineated with triple backticks \
below. Return just the summary and nothing else.
Text: ```{}``` """
sample_text = 'This is a sample text'
print( prompt4.format(sample_text))

Your task is to carefully analyze and summarize the text delineated with triple backticks below. Return just the summary and nothing else.
Text: ```This is a sample text``` 


In [23]:
def submit_request(prompt_):
    '''Send prompt_ as a single user message and return what get_response gives back.

    The globally configured `model` is used; no system message is added and
    the token-count check is disabled.
    '''
    user_turn = {"role": "user", "content": prompt_}
    return get_response(model, [user_turn])

In [72]:
# Extract keywords/keyphrases for every training text using prompt2.
# Results are accumulated in `res` (text -> model response).
start = time.time()
# `res` is intentionally not re-initialised here, so re-running the cell
# skips texts that already have a response.
# NOTE(review): `res` is also used by earlier cells for other results -
# confirm it was reset to an empty dict before the first run of this cell.
#res   = dict()
count = 0
for text in df_train[text_col].values:
    if text in res:
        continue
    prompt = prompt2.format(text)   
    try:
        res[ text ] = submit_request( prompt )
    except Exception as e:
        # best effort: report and move on; the text stays absent from res
        print(f'\nText: {text}\nError: {e}\n')
                
    # counts attempted texts, including failed ones
    count += 1    
    if count % 10 == 0:
        print(f'Processing text {count}')                        
        
elapsed = (time.time() - start)/60
print(f'\nTime elapsed {round(elapsed, 4)} min')

Processing text 10
Processing text 20
Processing text 30
Processing text 40
Processing text 50
Processing text 60
Processing text 70
Processing text 80
Processing text 90
Processing text 100
Processing text 110
Processing text 120
Processing text 130
Processing text 140
Processing text 150
Processing text 160
Processing text 170
Processing text 180
Processing text 190
Processing text 200
Processing text 210
Processing text 220
Processing text 230
Processing text 240
Processing text 250
Processing text 260
Processing text 270
Processing text 280
Processing text 290
Processing text 300
Processing text 310
Processing text 320
Processing text 330
Processing text 340
Processing text 350
Processing text 360
Processing text 370
Processing text 380
Processing text 390
Processing text 400
Processing text 410
Processing text 420
Processing text 430
Processing text 440
Processing text 450
Processing text 460
Processing text 470
Processing text 480
Processing text 490
Processing text 500
Processin

In [71]:
res

{'It breaks my heart to see people living in those conditions. I hope that all the aid that was sent to the island makes it to the people who need it the most. I do not know what I would do it that was my family and I. I would hope that I would do my best, but I can see how depressing and hopeless you could feel having your whole life changed because of a storm and not knowing where your next meal is coming from.': 'breaks my heart, people living, conditions, aid, island, need, family, depressing, hopeless, whole life changed, storm, next meal',
 "I wonder why there aren't more people trying to help these people. I understand Haiti is not the richest nor less corrupt country but surely there must be a way to help. Supplies being looted by crowds is understandable because they are hungry and people need food and water to survive. We must think of other ways to distribute the food and water.": 'people, help, Haiti, richest, corrupt country, supplies, looted, crowds, hungry, food, water, 

In [74]:
print(len(df_train), len(res))

792 792


In [75]:
df_train['gpt4_keywords'] = df_train[text_col].map( res )

In [77]:
# Extract keywords/keyphrases for every dev text using prompt2.
# Results are accumulated in `res2` (text -> model response).
start = time.time()
res2   = dict()
count = 0
for text in df_dev[text_col].values:
    # duplicate texts are only submitted once
    if text in res2:
        continue
    prompt = prompt2.format(text)   
    try:
        res2[ text ] = submit_request( prompt )
    except Exception as e:
        # best effort: report and move on; the text stays absent from res2
        print(f'\nText: {text}\nError: {e}\n')
                
    # counts attempted texts, including failed ones
    count += 1    
    if count % 10 == 0:
        print(f'Processing text {count}')                        
        
elapsed = (time.time() - start)/60
print(f'\nTime elapsed {round(elapsed, 4)} min')

Processing text 10
Processing text 20
Processing text 30
Processing text 40
Processing text 50
Processing text 60
Processing text 70
Processing text 80
Processing text 90
Processing text 100
Processing text 110
Processing text 120
Processing text 130
Processing text 140
Processing text 150
Processing text 160
Processing text 170
Processing text 180
Processing text 190
Processing text 200

Time elapsed 27.9552 min


In [79]:
print(len(df_dev), len(res2))

208 208


In [80]:
df_dev['gpt4_keywords'] = df_dev[text_col].map( res2 )

In [82]:
model

'gpt-4'

In [90]:
df_train['gpt4_keywords'].values[:10]

array(['breaks my heart, people living, conditions, aid, island, need, family, depressing, hopeless, whole life changed, storm, next meal',
       'people, help, Haiti, richest, corrupt country, supplies, looted, crowds, hungry, food, water, survive, distribute',
       'reading, article, sad, terrible, people, affected, hurricane, situation, deserve, mother nature, plans, children, animals, shelter, food',
       'sad, amazing story, freak accident, life, amazing triumphs, die, heart breaking, came from nothing, American dream, rare, crazy way',
       'reading, article, world, lost, kindhearted, generous person, drugs, alcohol, accident, crash, boat, life jacket, speed, rocks',
       'sad, find out, happened, controlling, drugs, system, young person, world, fingertips, loses, life, controllable, unfortunate',
       'reading, article, reaction, sad, boys, young, behind bars, children, experience, childhood, fun, age, facing hardships, playing, friends, school, locked up, cell',
    

In [83]:
file = 'data/df_train.pkl'
df_train.to_pickle( file )

file = 'data/df_dev.pkl'
df_dev.to_pickle( file )

## Results

Conclusions:
* Temperature = 0 consistently outperforms temp = 0.5
* Concatenating random examples outperforms concatenating 30 closest examples

### Concatenate random ~30 examples

__Experiment 10__:  __BEST__  
Column = 'spellchecked', temperature = 0  
No system role, __second Neutral removed__    
_Prompt_:  concatenated examples (no list of instructions)
```
              precision    recall  f1-score   support

       Anger     0.5200    0.3421    0.4127        38
     Disgust     0.3846    0.6250    0.4762        24
        Fear     0.4375    0.8750    0.5833         8
        Hope     0.2667    0.5000    0.3478        16
         Joy     0.3333    0.5000    0.4000         2
     Neutral     0.7391    0.3148    0.4416        54
     Sadness     0.7132    0.9109    0.8000       101
    Surprise     0.3333    0.3333    0.3333         3

   micro avg     0.5746    0.6260    0.5992       246
   macro avg     0.4660    0.5501    0.4744       246
weighted avg     0.6113    0.6260    0.5845       246
 samples avg     0.6170    0.6370    0.6077       246
```

__Experiment 12__  
Same as 10, but the example dfs were exploded - to avoid examples with double categories  
This is an example of a case where ChatGPT learns from single-category examples and therefore outputs only one category per essay. An instruction prompt is needed to make sure two categories are output when relevant (see Experiment 14).
```
              precision    recall  f1-score   support

       Anger     0.5000    0.3421    0.4063        38
     Disgust     0.5000    0.4583    0.4783        24
        Fear     0.3000    0.3750    0.3333         8
        Hope     0.2500    0.3125    0.2778        16
         Joy     0.5000    0.5000    0.5000         2
     Neutral     0.6800    0.3148    0.4304        54
     Sadness     0.8200    0.8119    0.8159       101
    Surprise     0.3333    0.3333    0.3333         3

   micro avg     0.6394    0.5407    0.5859       246
   macro avg     0.4854    0.4310    0.4469       246
weighted avg     0.6461    0.5407    0.5759       246
 samples avg     0.6394    0.5601    0.5865       246
 ```

__Experiment 14__  
Same as 11, but the example dfs were exploded - to avoid examples with double categories  
```
              precision    recall  f1-score   support

       Anger     0.5600    0.3684    0.4444        38
     Disgust     0.5000    0.5833    0.5385        24
        Fear     0.2692    0.8750    0.4118         8
        Hope     0.2400    0.3750    0.2927        16
         Joy     0.0000    0.0000    0.0000         2
     Neutral     0.6875    0.2037    0.3143        54
     Sadness     0.7788    0.8020    0.7902       101
    Surprise     0.0000    0.0000    0.0000         3

   micro avg     0.5911    0.5407    0.5648       246
   macro avg     0.3794    0.4009    0.3490       246
weighted avg     0.6303    0.5407    0.5471       246
 samples avg     0.6058    0.5601    0.5681       246
```

__Experiment 11__:  
Column = 'spellchecked', temperature = 0  
No system role, __second Neutral removed__    
```
_Prompt_:  
1 - Below you are given examples of essays with categories separated by four hashtags.
2 - Each essay has one or two relevant categories from the following list: Sadness, Neutral, Anger, Disgust, Fear, Hope, Surprise, Joy.
3 - Your task is to carefully learn from the examples of essays with categories in order to understand what features or words in the essays make them belong to a specific category and then use this knowledge to assign the correct relevant category from the above list to the very last essay.
4 - You may add a second category from the above list ONLY AND ONLY IF it is also relevant to the very last essay.
5 - Output just the category or categories for the last essay and nothing else. If there are two relevant categories: sort them alphabetically, concatenate with a forward slash, and output only them and nothing else.

####

+ concatenated examples

Followup:  
Are you sure about that? If yes - repeat the same output, if no - change the category, but make sure it's from the list of predefined categories


```
```
              precision    recall  f1-score   support

       Anger     0.6154    0.4211    0.5000        38
     Disgust     0.4054    0.6250    0.4918        24
        Fear     0.2857    0.7500    0.4138         8
        Hope     0.2759    0.5000    0.3556        16
         Joy     1.0000    0.5000    0.6667         2
     Neutral     0.8667    0.2407    0.3768        54
     Sadness     0.7154    0.9208    0.8052       101
    Surprise     0.0000    0.0000    0.0000         3

   micro avg     0.5779    0.6179    0.5972       246
   macro avg     0.5206    0.4947    0.4512       246
weighted avg     0.6539    0.6179    0.5805       246
 samples avg     0.6154    0.6346    0.6026       246
```

__Experiment 5__:  
Column = 'spellchecked', temperature = 0  
No system role, second Neutral not removed    

_Prompt_: concatenated examples (no list of instructions)
```
              precision    recall  f1-score   support

       Anger     0.5000    0.3421    0.4063        38
     Disgust     0.3023    0.5417    0.3881        24
        Fear     0.4444    1.0000    0.6154         8
        Hope     0.2759    0.5000    0.3556        16
         Joy     0.2500    0.5000    0.3333         2
     Neutral     0.6667    0.3333    0.4444        54
     Sadness     0.7381    0.9208    0.8194       101
    Surprise     0.1250    0.3333    0.1818         3

   micro avg     0.5516    0.6301    0.5882       246
   macro avg     0.4128    0.5589    0.4430       246
weighted avg     0.5921    0.6301    0.5827       246
 samples avg     0.6010    0.6442    0.5986       246
```

__Experiment 4__  
Column = 'spellchecked', temperature = 0.5  
No system role, second Neutral not removed    

_Prompt_: concatenated examples (no list of instructions)
```
              precision    recall  f1-score   support

       Anger     0.5600    0.3684    0.4444        38
     Disgust     0.3556    0.6667    0.4638        24
        Fear     0.3500    0.8750    0.5000         8
        Hope     0.2333    0.4375    0.3043        16
         Joy     0.2500    0.5000    0.3333         2
     Neutral     0.6538    0.3148    0.4250        54
     Sadness     0.7381    0.9208    0.8194       101
    Surprise     0.1250    0.3333    0.1818         3

   micro avg     0.5493    0.6341    0.5887       246
   macro avg     0.4082    0.5521    0.4340       246
weighted avg     0.5979    0.6341    0.5846       246
 samples avg     0.5913    0.6466    0.5929       246
 ```

__Experiment 3__  
Column = 'spellchecked', temperature = 0.5
No system role, second Neutral not removed    
Same prompt

```
              precision    recall  f1-score   support

       Anger     0.5000    0.3421    0.4063        38
     Disgust     0.3409    0.6250    0.4412        24
        Fear     0.2857    0.7500    0.4138         8
        Hope     0.2326    0.6250    0.3390        16
         Joy     0.0000    0.0000    0.0000         2
     Neutral     0.7037    0.3519    0.4691        54
     Sadness     0.6889    0.9208    0.7881       101
    Surprise     0.2000    0.3333    0.2500         3

   micro avg     0.5199    0.6382    0.5730       246
   macro avg     0.3690    0.4935    0.3884       246
weighted avg     0.5747    0.6382    0.5709       246
 samples avg     0.5697    0.6514    0.5841       246
```

__Experiment 2__  
Column = 'spellchecked', temperature = 0  
No system role used, second Neutral not removed      
Same prompt  
```
              precision    recall  f1-score   support

       Anger     0.5833    0.3684    0.4516        38
     Disgust     0.3684    0.5833    0.4516        24
        Fear     0.3810    1.0000    0.5517         8
        Hope     0.2143    0.5625    0.3103        16
         Joy     0.3333    0.5000    0.4000         2
     Neutral     0.6923    0.3333    0.4500        54
     Sadness     0.6861    0.9307    0.7899       101
    Surprise     0.2000    0.3333    0.2500         3

   micro avg     0.5372    0.6463    0.5867       246
   macro avg     0.4323    0.5765    0.4569       246
weighted avg     0.5912    0.6463    0.5813       246
 samples avg     0.5865    0.6587    0.5962       246
```

__Experiment 1__  
Column = 'spellchecked', temperature = 0  
Using system role: { "role": "system", "content": "You are a helpful emotion classifier.", }   
Second Neutral not removed  

_Prompt_  
```
1 - Below you are given examples of texts with their most relevant emotion categories. The examples are separated by four hashtags.
2 - Each text can belong to one or two most relevant emotion categories from the following list: Sadness, Neutral, Anger, Disgust, Fear, Hope, Surprise, Joy.
3 - Your task is to classify the last text by selecting the most relevant emotion category from the above list. You may add a second emotion category from the above list ONLY AND ONLY IF it is also relevant to the last text.
4 - Before you perform your task, carefully learn from the examples of texts with their categories and then classify the last text based on this acquired knowledge.
5 - Output just the category or categories for the last text and nothing else. If there are two relevant emotion categories: sort them alphabetically, concatenate with a forward slash, and output only them and nothing else.

+ concatenated examples
```

```
              precision    recall  f1-score   support

       Anger     0.5357    0.3947    0.4545        38
     Disgust     0.3182    0.5833    0.4118        24
        Fear     0.3200    1.0000    0.4848         8
        Hope     0.2286    0.5000    0.3137        16
         Joy     0.0000    0.0000    0.0000         2
     Neutral     0.6250    0.2778    0.3846        54
     Sadness     0.6667    0.9109    0.7699       101
    Surprise     0.2000    0.3333    0.2500         3

   micro avg     0.5100    0.6220    0.5604       246
   macro avg     0.3618    0.5000    0.3837       246
weighted avg     0.5524    0.6220    0.5501       246
 samples avg     0.5481    0.6346    0.5673       246
 ```

### Concatenate closest 30 examples (cosine)

__Experiment 9__  
Exactly as experiment 8, but Neutral was removed from coupled emotion categories  
```
              precision    recall  f1-score   support

       Anger     0.5926    0.4211    0.4923        38
     Disgust     0.4412    0.6250    0.5172        24
        Fear     0.3636    0.5000    0.4211         8
        Hope     0.3214    0.5625    0.4091        16
         Joy     0.0000    0.0000    0.0000         2
     Neutral     0.6957    0.2963    0.4156        54
     Sadness     0.6714    0.9307    0.7801       101
    Surprise     0.0000    0.0000    0.0000         3

   micro avg     0.5789    0.6260    0.6016       246
   macro avg     0.3857    0.4169    0.3794       246
weighted avg     0.5957    0.6260    0.5783       246
 samples avg     0.6178    0.6394    0.6106       246
```

__Experiment 8__  
Column = 'essay_clean_spellchecked', temperature = 0  
No system role, second Neutral not removed    
Prompt:  
```
1 - Below you are given examples of texts with their most relevant emotion categories. The examples are separated by four hashtags.
2 - Each text can belong to one or two most relevant emotion categories from the following list: Sadness, Neutral, Anger, Disgust, Fear, Hope, Surprise, Joy.
3 - Your task is to carefully learn from the examples of texts with their categories and then use the acquired knowledge to classify the very last text by selecting the most relevant emotion category from the above list.
4 - You may add a second emotion category from the above list ONLY AND ONLY IF it is also relevant to the very last text.
5 - Output just the category or categories for the last text and nothing else. If there are two relevant emotion categories: sort them alphabetically, concatenate with a forward slash, and output only them and nothing else.

 + concatenated examples
```

```
              precision    recall  f1-score   support

       Anger     0.5714    0.4211    0.4848        38
     Disgust     0.4688    0.6250    0.5357        24
        Fear     0.4000    0.5000    0.4444         8
        Hope     0.3333    0.5625    0.4186        16
         Joy     0.0000    0.0000    0.0000         2
     Neutral     0.7037    0.3519    0.4691        54
     Sadness     0.6690    0.9406    0.7819       101
    Surprise     0.0000    0.0000    0.0000         3

   micro avg     0.5896    0.6423    0.6148       246
   macro avg     0.3933    0.4251    0.3918       246
weighted avg     0.5978    0.6423    0.5928       246
 samples avg     0.6298    0.6611    0.6258       246
```

__Experiment 7__  
Column = 'essay_clean', temperature = 0  
No system role, second Neutral not removed      
Prompt:  
```
1 - Below you are given examples of texts with their most relevant emotion categories. The examples are separated by four hashtags.
2 - Each text can belong to one or two most relevant emotion categories from the following list: Sadness, Neutral, Anger, Disgust, Fear, Hope, Surprise, Joy.
3 - Your task is to carefully learn from the examples of texts with their categories and then use the acquired knowledge to classify the very last text by selecting the most relevant emotion category from the above list.
4 - You may add a second emotion category from the above list ONLY AND ONLY IF it is also relevant to the very last text.
5 - Output just the category or categories for the last text and nothing else. If there are two relevant emotion categories: sort them alphabetically, concatenate with a forward slash, and output only them and nothing else.

 + concatenated examples
```

```
              precision    recall  f1-score   support

       Anger     0.5517    0.4211    0.4776        38
     Disgust     0.4375    0.5833    0.5000        24
        Fear     0.4000    0.5000    0.4444         8
        Hope     0.3214    0.5625    0.4091        16
         Joy     0.0000    0.0000    0.0000         2
     Neutral     0.7037    0.3519    0.4691        54
     Sadness     0.6738    0.9406    0.7851       101
    Surprise     0.0000    0.0000    0.0000         3

   micro avg     0.5836    0.6382    0.6097       246
   macro avg     0.3860    0.4199    0.3857       246
weighted avg     0.5929    0.6382    0.5889       246
 samples avg     0.6226    0.6538    0.6194       246
```

__Experiment 6__  
Column = 'essay_clean', temperature = 0  
No system role, second Neutral not removed      
_Prompt_: concatenated examples (no list instructions)
```
              precision    recall  f1-score   support

       Anger     0.5357    0.3947    0.4545        38
     Disgust     0.4000    0.5000    0.4444        24
        Fear     0.5455    0.7500    0.6316         8
        Hope     0.2609    0.3750    0.3077        16
         Joy     0.0000    0.0000    0.0000         2
     Neutral     0.6129    0.3519    0.4471        54
     Sadness     0.6741    0.9010    0.7712       101
    Surprise     0.3333    0.3333    0.3333         3

   micro avg     0.5725    0.6098    0.5906       246
   macro avg     0.4203    0.4507    0.4237       246
weighted avg     0.5718    0.6098    0.5730       246
 samples avg     0.6106    0.6130    0.5962       246
```

## Appendix

In [30]:
# Another way to randomly split the data: build stratified folds and use each
# fold's held-out part as one chunk of examples.
X = df_train['essay_clean'].values
# Each row's emotion labels are joined into a single string with '/' so that
# StratifiedKFold receives one class label per row.
y = ['/'.join(i) for i in df_train['emotion'].values]

skf        = StratifiedKFold(n_splits=18, shuffle=True, random_state=random_state)
# Keep only the held-out ("test") indices of every fold; the train part is discarded.
chunks_idx = [test_index for _, test_index in skf.split(X, y)]
print('Length X:', len(X))
print('Total chunks length:', sum(len(i) for i in chunks_idx), '\n')

for ch in chunks_idx:
    # split() yields positional indices into X, so rows must be selected by
    # position (.iloc). Label-based .loc is only equivalent when df_train has
    # a default 0..n-1 index and would otherwise pick wrong rows or fail.
    chunk = df_train.iloc[ch]
    # Per-chunk label distribution (rows with several emotions count once per emotion).
    print(chunk.explode('emotion')['emotion'].value_counts())
    # Token cost of sending the whole chunk as a single user message.
    messages = [ {'role': 'user', 'content': ' '.join(chunk['essay_clean'].tolist())} ]
    num_tokens = num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301")
    print('Num tokens:', num_tokens)
    print('\n', '='*75, '\n', sep='')

Length X: 792
Total chunks length: 792 

Sadness     22
Neutral     12
Anger        6
Disgust      5
Hope         3
Fear         2
Surprise     1
Name: emotion, dtype: int64
Num tokens: 3480


Sadness     22
Neutral     14
Anger        6
Disgust      4
Surprise     2
Hope         2
Fear         1
Joy          1
Name: emotion, dtype: int64
Num tokens: 3819


Sadness     21
Neutral     14
Anger        5
Disgust      4
Hope         2
Surprise     2
Fear         2
Joy          1
Name: emotion, dtype: int64
Num tokens: 3744


Sadness     21
Neutral     14
Anger        6
Disgust      4
Hope         2
Fear         2
Joy          1
Surprise     1
Name: emotion, dtype: int64
Num tokens: 3946


Sadness     21
Neutral     14
Anger        7
Disgust      5
Hope         2
Fear         2
Surprise     1
Name: emotion, dtype: int64
Num tokens: 3801


Sadness     20
Neutral     13
Anger        8
Disgust      5
Hope         2
Surprise     2
Fear         2
Name: emotion, dtype: int64
Num tokens: 3777


Sa

