# Copy labels leaked from training set. Classify the rest using ChatGPT API

## The Association for Computational Linguistics
## WASSA 2023 Shared Task on Multi-Label and Multi-Class Emotion Classification on Code-Mixed Text Messages
See more details [here](https://codalab.lisn.upsaclay.fr/competitions/10864#learn_the_details)

In [1]:
import openai
import numpy as np
import pandas as pd
import sklearn
import re, os
import time
import zipfile, pickle
from typing import List
from copy import deepcopy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from openai.embeddings_utils import cosine_similarity
from tqdm.autonotebook import tqdm
import random
import tiktoken
import backoff
tqdm.pandas()

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 400)
#os.path.join()

  from tqdm.autonotebook import tqdm


In [39]:
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    '''Return number of tokens used in a list of messages for ChatGPT'''
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        #print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model == "gpt-3.5-turbo":
        #print("Warning: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301")
    elif model == "gpt-4":
        #print("Warning: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.")
        return num_tokens_from_messages(messages, model="gpt-4-0314")
    elif model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif model == "gpt-4-0314":
        tokens_per_message = 3
        tokens_per_name = 1
    else:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens

In [2]:
random_state = 47

# Load and Prepare Data

In [3]:
file1    = 'data/mcec_train_translated.pkl'
df_train = pd.read_pickle(file1)

file2    = 'data/mcec_dev_translated.pkl'
df_dev   = pd.read_pickle(file2)

file3    = 'data/mcec_test_embedded.pkl'
df_test  = pd.read_pickle(file3)

file4    = 'data/sample_submission/predictions_MCEC.csv'
sample_submission = pd.read_csv(file4)

print(df_train.shape, df_dev.shape, df_test.shape, sample_submission.shape)

(9530, 5) (1191, 12) (1191, 2) (1191, 1)


In [4]:
# submission format
print( type(sample_submission) )
sample_submission.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Emotion
0,neutral
1,neutral
2,pessimism
3,disgust
4,fear


In [5]:
print(df_train['emotion'].value_counts(), '\n')
df_train.head()

neutral         3262
trust           1118
joy             1022
optimism         880
anticipation     832
disgust          687
sadness          486
fear             453
anger            226
surprise         199
love             187
pessimism        178
Name: emotion, dtype: int64 



Unnamed: 0,text,emotion,translated_hi,translated_ur,gpt_embedding
0,Yes.I am in fyp lab cabin.but fyp presentation...,neutral,Yes.i am in fyp lab cabin.but fyp presentation...,Y. Um in Fap Lab Cabin. Butt Fap Presentations...,"[-0.005477939732372761, -0.01738985814154148, ..."
1,Yar insan ka bcha bn chawliyn na mar :p,joy,"Dude become a child of a human being, do not die.",Dude human beings do not die: P: P,"[0.0006696455529890954, -0.006965265609323978,..."
2,Terai uncle nai kahna hai kai ham nai to bahr ...,disgust,Your Uncle Nai says that we had sent out money,Your Ankali says that we sent out money and wa...,"[0.021171217784285545, -0.02109299972653389, 0..."
3,Yr ajao I m cming in the club,neutral,YR AJAO I'M Coming in the Club,Yer organs were the club,"[-0.010511564090847969, -0.02738134376704693, ..."
4,Mje wese Nimra ahmad ka Qur'aan ki aayaat k ba...,joy,Mje wes nimra ahmad ka qur'aan ki aayaat k bar...,Mje Wese Nimra Ahmad Ka Qur'aan Ki Aayaaat K B...,"[-0.0016743674641475081, -0.021855581551790237..."


In [6]:
print(df_dev['emotion'].value_counts(), '\n')
df_dev.head()

neutral         388
joy             131
trust           125
disgust         113
optimism        110
anticipation     94
sadness          62
fear             52
surprise         35
anger            35
pessimism        29
love             17
Name: emotion, dtype: int64 



Unnamed: 0,text,emotion,target,gtp_translated,translated_hi,translated_ur,text_clean,gpt_pred,gpt_pred_num,gpt_translated2,gpt_translated2_corrected,gpt_embedding
0,Tension lene ki koi baat ni,neutral,1,There's no need to take tension.,There is nothing to take tension,Any talk of taking tangoes,Tension lene ki koi baat ni,neutral,1,There's no need to worry.,There's no need to worry.,"[-0.00021548081713262945, 0.005029499996453524..."
1,Main ghar punch gya hun or ab spny laga hun,neutral,1,I have reached home and now I am going to sleep.,I have gone home punch and now I am Sapni,I have gone home punch and now dreams,Main ghar punch gya hun or ab spny laga hun,neutral,1,I have reached home and now I am going to sleep.,I have reached home and now I am going to sleep.,"[-0.0010164333507418633, -0.013282055966556072..."
2,Nai mje nai mili mail..mene check ki ti,pessimism,0,"I didn't receive any mail, I had checked.",Nai Maje Nai Mile Mail .. I checked,Ni Ni Ni Mille Mail,Nai mje nai mili mail .. mene check ki ti,neutral,1,I didn't receive any new mail. I had checked.,I didn't receive any new mail. I had checked.,"[-0.010691414587199688, -0.01292553823441267, ..."
3,Yr us din mai pura din bzy rahe vo mujy awne h...,disgust,0,"That day, they were busy all day and not givin...",YR Us Din Mai Pura Din Bzy Rahe Vo Mujy Awne H...,Yr us din mai pura din bzy rahe vo mujy awne h...,Yr us din mai pura din bzy rahe vo mujy awne h...,negative,0,"I was busy the whole day on that day, they wer...","I was busy the whole day on that day, they wer...","[0.009936108253896236, -0.016926730051636696, ..."
4,Lakin wo abhe dar dar ka chalata ha,fear,0,But he still walks cautiously.,But it still moves at the rate,But Wu runs the cedar,Lakin wo abhe dar dar ka chalata ha,neutral,1,But he still walks with fear and hesitation.,But he still walks with fear and hesitation.,"[0.019262924790382385, -0.0011249196249991655,..."


In [7]:
# light text cleaning (should I use clean regex for better accuracy?)
pad_punct    = re.compile('([^a-zA-Z ]+)')
multi_spaces = re.compile('\s{2,}')
#clean        = re.compile('[^a-zA-Z0-9,.?!\'\s]+')

def clean_text(s):
    s = s.replace('\n', ' ')
    s = pad_punct.sub(r' \1 ', s)
    #s = clean.sub(' ', s)
    s = multi_spaces.sub(' ', s)
    return s.strip()

df_train['text_clean'] = df_train['text'].apply( clean_text )
df_dev['text_clean']   = df_dev['text'].apply( clean_text )
df_test['text_clean']  = df_test['text'].apply( clean_text )

In [8]:
# 93 complete duplicates - can't reduce because this is a test set
print(df_test.shape)
temp1 = df_test[ df_test.duplicated(subset=['text_clean'], keep=False) ]
print(temp1.shape)

(1191, 3)
(93, 3)


In [9]:
cols = ['text', 'text_clean', 'emotion', 'gpt_embedding',]
df_combined = pd.concat([df_train[cols], df_dev[cols]])
print(df_train.shape, df_dev.shape, df_combined.shape)
df_combined.head()

(9530, 6) (1191, 12) (10721, 4)


Unnamed: 0,text,text_clean,emotion,gpt_embedding
0,Yes.I am in fyp lab cabin.but fyp presentation...,Yes . I am in fyp lab cabin . but fyp presenta...,neutral,"[-0.005477939732372761, -0.01738985814154148, ..."
1,Yar insan ka bcha bn chawliyn na mar :p,Yar insan ka bcha bn chawliyn na mar : p,joy,"[0.0006696455529890954, -0.006965265609323978,..."
2,Terai uncle nai kahna hai kai ham nai to bahr ...,Terai uncle nai kahna hai kai ham nai to bahr ...,disgust,"[0.021171217784285545, -0.02109299972653389, 0..."
3,Yr ajao I m cming in the club,Yr ajao I m cming in the club,neutral,"[-0.010511564090847969, -0.02738134376704693, ..."
4,Mje wese Nimra ahmad ka Qur'aan ki aayaat k ba...,Mje wese Nimra ahmad ka Qur ' aan ki aayaat k ...,joy,"[-0.0016743674641475081, -0.021855581551790237..."


In [10]:
def transfer_label_to_test(row):
    candidates = df_combined[ df_combined['text_clean']==row['text_clean'] ]['emotion'].tolist()
    if len(candidates) == 0:
        pass
    elif len(set(candidates)) == 1:
        row['Emotion'] = candidates[0]
    else:
        print(f"Row #{row.name}: combined labels are {candidates}")
    return row


df_test['Emotion'] = np.nan
df_test = df_test.apply( transfer_label_to_test, axis=1 )

print(df_test.isna().sum())
df_test['Emotion'].value_counts()

text               0
gpt_embedding      0
text_clean         0
Emotion          607
dtype: int64


neutral         194
trust            72
optimism         54
disgust          51
joy              51
anticipation     43
sadness          37
fear             32
pessimism        16
anger            13
surprise         11
love             10
Name: Emotion, dtype: int64

In [11]:
df_test.head()

Unnamed: 0,text,gpt_embedding,text_clean,Emotion
0,Razia bta rahe the but wo sure nahe the,"[0.00961806159466505, -0.009336333721876144, 0...",Razia bta rahe the but wo sure nahe the,
1,Me phr kuch parh hi lun :-P,"[0.004914113786071539, -0.011376334354281425, ...",Me phr kuch parh hi lun :- P,
2,Hoxtl life ma hm bht jald matur hO jaty hai,"[0.006094285752624273, -0.002856901613995433, ...",Hoxtl life ma hm bht jald matur hO jaty hai,fear
3,Yar A4 me seminar ha a ja..,"[-0.018210574984550476, -0.021075459197163582,...",Yar A 4 me seminar ha a ja ..,neutral
4,K.Quid e azam k 400 bnay hain,"[-0.0046786158345639706, -0.009765759110450745...",K . Quid e azam k 400 bnay hain,neutral


Overlap has already been used - no need to remove it from df_combined

In [21]:
file = 'data/df_test_with_leakedData_only.pkl'
df_test.to_pickle(file)

In [19]:
df_test[ df_test['Emotion'].isna() ].shape

(607, 4)

# Find 100 closest

In [22]:
# find top_n closest df_train/df_dev embeddings for each df_test embedding
def batch_cosine(embedding_, df, top_n=100):
    df['similarity'] = df['gpt_embedding'].apply(lambda x: cosine_similarity(x, embedding_))
    return df.sort_values(by='similarity', ascending=False).head(top_n)['text'].tolist()

df_combined_copy = df_combined.copy()
start = time.time()
res   = dict()
count = 0
for t, e in df_test[ df_test['Emotion'].isna() ][['text', 'gpt_embedding']].values:
    if t in res:
        continue
    res[ t ] = batch_cosine( e, df_combined_copy, top_n=100, )
    count += 1
    if count % 10 == 0:
        print(f'Processing text {count}. Time elapsed: {round((time.time()-start)/60, 4)} min')
        with open('data/res.pkl', 'wb') as f:
            pickle.dump(res, f, protocol=pickle.HIGHEST_PROTOCOL)
                        
elapsed = (time.time() - start)/60
print(f'\nTime elapsed {round(elapsed, 4)} min')

Processing text 10. Time elapsed: 0.5274 min
Processing text 20. Time elapsed: 1.0547 min
Processing text 30. Time elapsed: 1.5857 min
Processing text 40. Time elapsed: 2.1133 min
Processing text 50. Time elapsed: 2.6383 min
Processing text 60. Time elapsed: 3.1685 min
Processing text 70. Time elapsed: 3.6973 min
Processing text 80. Time elapsed: 4.2252 min
Processing text 90. Time elapsed: 4.7567 min
Processing text 100. Time elapsed: 5.2912 min
Processing text 110. Time elapsed: 5.9743 min
Processing text 120. Time elapsed: 6.6496 min
Processing text 130. Time elapsed: 7.2865 min
Processing text 140. Time elapsed: 7.9241 min
Processing text 150. Time elapsed: 8.5195 min
Processing text 160. Time elapsed: 9.0946 min
Processing text 170. Time elapsed: 9.6611 min
Processing text 180. Time elapsed: 10.229 min
Processing text 190. Time elapsed: 10.7761 min
Processing text 200. Time elapsed: 11.3072 min
Processing text 210. Time elapsed: 11.8384 min
Processing text 220. Time elapsed: 12.36

In [24]:
len(df_test[ df_test['Emotion'].isna() ]['text'].unique()), len(res)

(587, 587)

In [27]:
df_test['closest_texts'] = None
df_test['closest_texts'] = df_test['text'].apply( lambda x: res[x] if x in res else None )
print(df_test.isna().sum())

file = 'data/df_test_100_closest_GptEmbeddings.pkl'
df_test.to_pickle(file)

text               0
gpt_embedding      0
text_clean         0
Emotion          607
closest_texts    584
dtype: int64


In [28]:
df_test.head(50)

Unnamed: 0,text,gpt_embedding,text_clean,Emotion,closest_texts
0,Razia bta rahe the but wo sure nahe the,"[0.00961806159466505, -0.009336333721876144, 0...",Razia bta rahe the but wo sure nahe the,,"[Hn hn zaror ye raz hi rahe ga, Razia ne ma'am..."
1,Me phr kuch parh hi lun :-P,"[0.004914113786071539, -0.011376334354281425, ...",Me phr kuch parh hi lun :- P,,"[I phne ke sheshkay lea ja raha ha :-P , Han m..."
2,Hoxtl life ma hm bht jald matur hO jaty hai,"[0.006094285752624273, -0.002856901613995433, ...",Hoxtl life ma hm bht jald matur hO jaty hai,fear,
3,Yar A4 me seminar ha a ja..,"[-0.018210574984550476, -0.021075459197163582,...",Yar A 4 me seminar ha a ja ..,neutral,
4,K.Quid e azam k 400 bnay hain,"[-0.0046786158345639706, -0.009765759110450745...",K . Quid e azam k 400 bnay hain,neutral,
5,Hm sb b a jate nd aur b bare bnde b a jate! :-D,"[-0.009173348546028137, -0.0036938106641173363...",Hm sb b a jate nd aur b bare bnde b a jate ! :- D,,[Tu tnsn na le tje to kafi bnde churane aa jat...
6,Yr apnay senior ka method ya hota ha k wo all ...,"[0.012461582198739052, 0.004763346165418625, 0...",Yr apnay senior ka method ya hota ha k wo all ...,neutral,
7,thek ha sir i will be at your office at 1,"[-0.005030790343880653, -0.015968872234225273,...",thek ha sir i will be at your office at 1,,"[ok sir i will be there, Sir at what time i ca..."
8,Hahahaha tuj ma agr itni wafa ha to tu he sab ...,"[-0.003401822643354535, -0.008568023331463337,...",Hahahaha tuj ma agr itni wafa ha to tu he sab ...,,[Hahahah ab tu dost k liyee itna b nahi kr sak...
9,Well you didnt told me before that the meeting...,"[-0.005642556585371494, -0.004453025758266449,...",Well you didnt told me before that the meeting...,,[I got load of work to do . Thats why i will t...


# ChatGPT API: Few-Shot Classification

## Helper functions

In [29]:
openai.api_key = os.getenv("OPENAI_API_KEY")
model          = 'gpt-3.5-turbo'
labels_set     = { 'neutral', 'joy', 'trust', 'disgust', 'optimism', 'anticipation', 'sadness', 'fear',
                   'surprise', 'anger', 'pessimism', 'love', }
clean = re.compile(r'[^a-zA-Z ]+')
multi_spaces = re.compile('\s{2,}')
print(labels_set)

{'pessimism', 'fear', 'anticipation', 'optimism', 'surprise', 'neutral', 'love', 'sadness', 'disgust', 'anger', 'trust', 'joy'}


In [30]:
def verify_label(label_):
    '''
       Verify if label_ contains any of the categories
       from the predefined set of labels
    '''
    label_ = clean.sub(' ', label_)
    label_ = multi_spaces.sub(' ', label_).lower().split()
    res    = [i for i in label_ if i in labels_set]
    res    = list(set(res))
    return '/'.join(res) if res else None

In [31]:
def verify_num_tokens(model, messages):
    '''Check that there is enough tokens available for a ChatGPT repsonse'''
    num_tokens_tiktoken = num_tokens_from_messages(messages, model)
    if num_tokens_tiktoken > 4080:
        print(f'Number of tokens is {num_tokens_tiktoken} which exceeds 4080')        
        return False
    else:
        return True


@backoff.on_exception(backoff.expo, openai.error.RateLimitError, max_time=10)
def get_response(model, messages, temperature=0, max_tokens=None):
    '''Send request, return reponse'''
    response  = openai.ChatCompletion.create(
        model = model,
        messages = messages,
        temperature = temperature,        # range(0,2), the more the less deterministic / focused
        top_p = 1,                        # top probability mass, e.g. 0.1 = only tokens from top 10% proba mass
        n = 1,                            # number of chat completions
        #max_tokens = max_tokens,          # tokens to return
        stream = False,        
        stop=None,                        # sequence to stop generation (new line, end of text, etc.)
        )
    content = response['choices'][0]['message']['content'].strip()
    #num_tokens_api = response['usage']['prompt_tokens']
    return content

## Approach 3: concatenate the closest few shot examples using the ChatGPT chat mode

In [32]:
model          = 'gpt-3.5-turbo'
embedding_type = 'gpt_embedding'
initial_prompt = 'Learn from the following examples of texts with assigned categories. ' +\
                 'Using this knowledge, select the most relevant category for the very last text below ' +\
                 'from the following list of predefined categories: neutral, joy, trust, disgust, optimism, ' +\
                 'anticipation, sadness, fear, surprise, anger, pessimism, love. ' +\
                 'Output only one most relevant category from the above list of predefined categories ' +\
                 'for the very last text below.'
print(initial_prompt)

# Using followup questions improves the reponse. but ChatGPT can change its mind too easily sometimes
followup1 = "Are you sure about that? If yes, output the same category, if no change the category, but make sure it's from the list of predefined categories"
print(followup1)

Learn from the following examples of texts with assigned categories. Using this knowledge, select the most relevant category for the very last text below from the following list of predefined categories: neutral, joy, trust, disgust, optimism, anticipation, sadness, fear, surprise, anger, pessimism, love. Output only one most relevant category from the above list of predefined categories for the very last text below.
Are you sure about that? If yes, output the same category, if no change the category, but make sure it's from the list of predefined categories


In [33]:
# Using top_n closest embeddings, create ChatGPT messages object (alternating user (text)/assistant(category) Q&As)
def create_messages(df_, closest_texts, top_n=100 ):
    df_temp = df_[ df_['text'].isin(closest_texts[:top_n]) ]    
    text0, emotion0 = df_temp[['text', 'emotion']].values[0]
    messages = [ { "role": "system", "content": "You are a helpful text classifier.", },
                 { "role": "user", "content": initial_prompt + f' Text: {text0}', },
                 { "role": "assistant", "content": f'Category: {emotion0}', }
               ]    
    for text, emotion in df_temp[['text', 'emotion']].values[1:]:                  # emotion instead of target here
        messages += [
            { "role": "user", "content": f'Text: {text}', },
            { "role": "assistant", "content": f'Category: {emotion}', }
        ]
    while num_tokens_from_messages(messages) > 4000:
        messages = messages[:-2]
    return messages

In [34]:
def classify_text_few_shot3(text_, messages_, model):
    '''Classify text_ using prompt_ and ChatGPT API'''
    messages_ = deepcopy(messages_)
    messages_ += [
        { "role": "user", "content": f'Text: {text_}', },
    ]
    while num_tokens_from_messages(messages_) > 4000:
        messages_ = messages_[:-3] + [messages_[-1]]    # remove 1 user/assistant iteration before last just added
    #if not verify_num_tokens(model, messages_): return None
    label_    = get_response(model, messages_)
    old_label = label_
    label_    = verify_label(label_)        # get just the category if response is too long
        
    # if label not found in response text - second, extended chat
    if label_ is None:
        messages_ += [
            { "role": "assistant", "content": old_label, },
            { "role": "user", "content": followup1, }
            ]        
        label_    = get_response(model, messages_)
        old_label = label_
        label_    = verify_label(label_)        # get just the category if response is too long
            
    return label_ if label_ is not None else old_label

In [40]:
# how one messages object looks
c = df_test['closest_texts'].values[5]
print(c)
create_messages(df_combined, c, top_n=10)

['Tu tnsn na le tje to kafi bnde churane aa jate! :-D  ', 'Uff etne se bt pa maun bnaya va ha janu na g :-P ', 'Hm tou agaye ab pahly kahti :-P ', 'Tay tsi zarur okhi krnii ay??? :-P ', 'I phne ke sheshkay lea ja raha ha :-P ', 'Hehehehe aj ux ke alaw beztie hoe :-P ', 'Hahaha bht buri ho :-O mene nai ana :-|', 'Puch to leti jane se pehle! Tu ne b mje lift nae krai! :-P ', 'Oh par ziada br hote ha na jani g. Pata ha ham dono ko ek doxra ke awax change krte ha :-D ', 'Dil karta ha uxa dkhtae he rhe :-P ', 'Hmmmmm aj maina date mare ux ka sath :-P ', 'Hmmmmm aj maina date mare ux ka sath :-P ', 'Janu aj a jao na milna :-D ', 'Janu aj a jao na milna :-D ', 'Tere khne k ilawa b krta rhta :P ', 'Me ny kab btaya tha py :O ', 'Me ny kab btaya tha py :O ', 'Me q pchy rhna c :-D ', 'Me bht bara item piece hn :-D', 'Ifsos hta ha. .aur bht hta hai :|', 'Hahaha bch k rehna :-P meri tarah bimar na hojana :-P', 'Yr jb meray 220 hjayen mje bta daina :-D ', 'Yr jb meray 220 hjayen mje bta daina :-D ',

[{'role': 'system', 'content': 'You are a helpful text classifier.'},
 {'role': 'user',
  'content': 'Learn from the following examples of texts with assigned categories. Using this knowledge, select the most relevant category for the very last text below from the following list of predefined categories: neutral, joy, trust, disgust, optimism, anticipation, sadness, fear, surprise, anger, pessimism, love. Output only one most relevant category from the above list of predefined categories for the very last text below. Text: Uff etne se bt pa maun bnaya va ha janu na g :-P '},
 {'role': 'assistant', 'content': 'Category: joy'},
 {'role': 'user', 'content': 'Text: Hahaha bht buri ho :-O mene nai ana :-|'},
 {'role': 'assistant', 'content': 'Category: joy'},
 {'role': 'user',
  'content': 'Text: Oh par ziada br hote ha na jani g. Pata ha ham dono ko ek doxra ke awax change krte ha :-D '},
 {'role': 'assistant', 'content': 'Category: anticipation'},
 {'role': 'user',
  'content': 'Text: Puc

In [44]:
# this simple iteration is faster than pandas df with tqdm
model = 'gpt-3.5-turbo'
start = time.time()
#res   = dict()
count = 0
for t, closest in df_test[ df_test['Emotion'].isna() ][['text', 'closest_texts']].values:
    if t in res:
        continue
    messages = create_messages(df_combined, closest, top_n=100)
    try:
        res[ t ] = classify_text_few_shot3(t, messages, model)
    except openai.error.RateLimitError:
        print(f'\nText: {t}.\nRate limit error\n')
    except Exception as e:
        print(f'\nText: {t}\nError: {e}\n')
                
    count += 1    
    if count % 10 == 0:
        print(f'Processing text {count}. Time elapsed: {round((time.time()-start)/60, 4)} min')
        with open('data/res.pkl', 'wb') as f:
            pickle.dump(res, f, protocol=pickle.HIGHEST_PROTOCOL)
                        
        
elapsed = (time.time() - start)/60
print(f'\nTime elapsed {round(elapsed, 4)} min')
#file = 'data/res.pkl'
#with open(file, 'rb') as f:
#    res2 = pickle.load(handle)


Time elapsed 0.0273 min


In [45]:
len(df_test[ df_test['Emotion'].isna() ]['text'].unique()), len(res)

(587, 587)

#### If one label is the output: classify_text_few_shot3()

In [46]:
df_test.head(50)

Unnamed: 0,text,gpt_embedding,text_clean,Emotion,closest_texts
0,Razia bta rahe the but wo sure nahe the,"[0.00961806159466505, -0.009336333721876144, 0...",Razia bta rahe the but wo sure nahe the,,"[Hn hn zaror ye raz hi rahe ga, Razia ne ma'am..."
1,Me phr kuch parh hi lun :-P,"[0.004914113786071539, -0.011376334354281425, ...",Me phr kuch parh hi lun :- P,,"[I phne ke sheshkay lea ja raha ha :-P , Han m..."
2,Hoxtl life ma hm bht jald matur hO jaty hai,"[0.006094285752624273, -0.002856901613995433, ...",Hoxtl life ma hm bht jald matur hO jaty hai,fear,
3,Yar A4 me seminar ha a ja..,"[-0.018210574984550476, -0.021075459197163582,...",Yar A 4 me seminar ha a ja ..,neutral,
4,K.Quid e azam k 400 bnay hain,"[-0.0046786158345639706, -0.009765759110450745...",K . Quid e azam k 400 bnay hain,neutral,
5,Hm sb b a jate nd aur b bare bnde b a jate! :-D,"[-0.009173348546028137, -0.0036938106641173363...",Hm sb b a jate nd aur b bare bnde b a jate ! :- D,,[Tu tnsn na le tje to kafi bnde churane aa jat...
6,Yr apnay senior ka method ya hota ha k wo all ...,"[0.012461582198739052, 0.004763346165418625, 0...",Yr apnay senior ka method ya hota ha k wo all ...,neutral,
7,thek ha sir i will be at your office at 1,"[-0.005030790343880653, -0.015968872234225273,...",thek ha sir i will be at your office at 1,,"[ok sir i will be there, Sir at what time i ca..."
8,Hahahaha tuj ma agr itni wafa ha to tu he sab ...,"[-0.003401822643354535, -0.008568023331463337,...",Hahahaha tuj ma agr itni wafa ha to tu he sab ...,,[Hahahah ab tu dost k liyee itna b nahi kr sak...
9,Well you didnt told me before that the meeting...,"[-0.005642556585371494, -0.004453025758266449,...",Well you didnt told me before that the meeting...,,[I got load of work to do . Thats why i will t...


In [48]:
# if one label is the output
def transfer_label(row):
    if not isinstance(row['Emotion'], str) and pd.isnull(row['Emotion']):
        row['Emotion'] = res.get(row['text'])
    return row
    
df_test = df_test.apply( transfer_label, axis=1 )

text                         0
emotion                      0
target                       0
gtp_translated               0
translated_hi                0
translated_ur                0
text_clean                   0
gpt_pred                     0
gpt_pred_num                 0
gpt_translated2              0
gpt_translated2_corrected    0
gpt_embedding                0
dtype: int64
neutral     1019
negative      88
positive      84
Name: gpt_pred, dtype: int64


In [50]:
print(df_test.isna().sum())
print(df_test['Emotion'].value_counts())

text               0
gpt_embedding      0
text_clean         0
Emotion            0
closest_texts    584
dtype: int64
neutral                     392
trust                       164
anticipation                124
joy                         122
optimism                    100
disgust                      78
sadness                      64
fear                         46
pessimism                    29
anger                        28
love                         23
surprise                     18
Category: disappointment      2
Category: uncertainty         1
Name: Emotion, dtype: int64


In [52]:
file = 'data/predictions_MCEC.csv'
df_test['Emotion'].to_csv( file, index=False, encoding='utf-8' )

The following corrections were made manually in the submission2_anedilko.csv file:
* Row 187, Category: disappointment => anger
* Row x, Category: disappointment => sadness
* Row x, Category: uncertainty => pessimism