# ChatGPT API: Zero-Shot Text Classification (Three Sentiment Classes)
## The Association for Computational Linguistics
## WASSA 2023 Shared Task on Multi-Label and Multi-Class Emotion Classification on Code-Mixed Text Messages

In [1]:
import openai
import numpy as np
import pandas as pd
import sklearn
import re, os
import time
import zipfile
from typing import List
from copy import deepcopy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from tqdm.autonotebook import tqdm
import tiktoken
tqdm.pandas()

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 400)
#os.path.join()

  from tqdm.autonotebook import tqdm


In [2]:
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    '''
        Return number of tokens used in a list of messages for ChatGPT

        messages - list of dicts (e.g. {"role": ..., "content": ...}); every value of
                   every dict is tokenized, and a "name" key adds the per-name adjustment
        model    - "gpt-3.5-turbo" and "gpt-4" are counted as the dated snapshots
                   "gpt-3.5-turbo-0301" and "gpt-4-0314"

        Raises NotImplementedError for any other model name. The model is validated
        before the tokenizer is loaded, so the encoding is looked up only once.
    '''
    # floating model names are counted as the snapshot they are assumed to point to
    aliases = {
        "gpt-3.5-turbo": "gpt-3.5-turbo-0301",
        "gpt-4": "gpt-4-0314",
    }
    # model -> (tokens_per_message, tokens_per_name)
    overheads = {
        # every message follows <|start|>{role/name}\n{content}<|end|>\n;
        # if there's a name, the role is omitted (hence -1)
        "gpt-3.5-turbo-0301": (4, -1),
        "gpt-4-0314": (3, 1),
    }

    model = aliases.get(model, model)
    if model not in overheads:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
    tokens_per_message, tokens_per_name = overheads[model]

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # model unknown to tiktoken - fall back to the cl100k_base encoding
        encoding = tiktoken.get_encoding("cl100k_base")

    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens

In [3]:
# new new version (Dec 2022)
def upsample_all( df_, labels_col='target', random_state=47 ):
    '''
        Upsample each class in column labels_col of pandas dataframe df_
        to the number of data points in majority class

        df_          - input dataframe (not modified)
        labels_col   - name of the column holding the class labels
        random_state - seed used for every sampling / shuffling step

        Returns a new shuffled dataframe with a fresh 0..n-1 index in which
        every class has as many rows as the largest class. An empty dataframe
        is returned unchanged (as a copy). Rows whose label never compares equal
        to itself (NaN) do not match any class filter and are left out.
    '''
    # nothing to balance - avoids max() on an empty sequence below
    if len(df_) == 0:
        return df_.copy().reset_index(drop=True)

    # one sub-dataframe per class, in order of first appearance
    dframes = { label: df_[ df_[labels_col] == label ].copy()
                for label in df_[labels_col].unique() }
    max_len = max( len(sub) for sub in dframes.values() )

    balanced = []
    for sub in dframes.values():
        deficit = max_len - len(sub)                  # how many rows to add
        if deficit > 0 and len(sub) > 0:
            # sample without replacement when the class has enough rows to cover
            # the deficit, otherwise with replacement
            extra = sub.sample( deficit, replace=len(sub) < deficit, random_state=random_state )
            sub   = pd.concat( [sub, extra] )         # class len + (max_len - class len)
        balanced.append( sub.sample( frac=1, random_state=random_state ) )   # shuffle

    # combine and reshuffle
    df_merged = pd.concat( balanced )
    df_merged = df_merged.sample( frac=1, random_state=random_state ).reset_index(drop=True)

    return df_merged

In [4]:
# single seed reused for upsampling, shuffling and the classifiers below
random_state = 47

# Load and Prepare Data

In [5]:
# Train and dev splits are stored as pickles.
# NOTE: unpickling can execute arbitrary code - only load pickles from a trusted source.
file1    = 'data/mcec_train_translated.pkl'
df_train = pd.read_pickle(file1)

file2    = 'data/mcec_dev_translated.pkl'
df_dev   = pd.read_pickle(file2)

# Test split and the sample submission are plain CSV files.
file3    = 'data/mcec_test.csv'
df_test  = pd.read_csv(file3)

file4    = 'data/sample_submission/predictions_MCEC.csv'
sample_submission = pd.read_csv(file4)

# sanity check: (rows, columns) of every loaded frame
print(df_train.shape, df_dev.shape, df_test.shape, sample_submission.shape)

(9530, 4) (1191, 10) (1191, 1) (1191, 1)


In [6]:
# submission format: show the object type and the first rows of the sample submission
print( type(sample_submission) )
sample_submission.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Emotion
0,neutral
1,neutral
2,pessimism
3,disgust
4,fear


In [7]:
# Collapse the 12 emotion labels into 3 sentiment codes:
# 0 = negative, 1 = neutral, 2 = positive (the same codes convert_pred assigns
# to the ChatGPT answers).
# NOTE(review): 'surprise' and 'anticipation' are counted as positive here - confirm.
label2key = {
    'neutral': 1,
    'trust': 2,
    'joy': 2,
    'optimism': 2,
    'anticipation': 2,
    'disgust': 0,
    'sadness': 0,
    'fear': 0,
    'anger': 0,
    'surprise': 2,
    'love': 2,
    'pessimism': 0,    
}

In [8]:
# Add the 3-class sentiment code for every row; an emotion that is missing
# from label2key would end up as NaN in 'target'.
df_train['target'] = df_train['emotion'].map( label2key )
df_dev['target']   = df_dev['emotion'].map( label2key )

In [9]:
# class balance of the train set: original emotions, then the collapsed sentiment codes
print(df_train['emotion'].value_counts(), '\n')
print(df_train['target'].value_counts())
df_train.head()

neutral         3262
trust           1118
joy             1022
optimism         880
anticipation     832
disgust          687
sadness          486
fear             453
anger            226
surprise         199
love             187
pessimism        178
Name: emotion, dtype: int64 

2    4238
1    3262
0    2030
Name: target, dtype: int64


Unnamed: 0,text,emotion,translated_hi,translated_ur,target
0,Yes.I am in fyp lab cabin.but fyp presentation...,neutral,Yes.i am in fyp lab cabin.but fyp presentation...,Y. Um in Fap Lab Cabin. Butt Fap Presentations...,1
1,Yar insan ka bcha bn chawliyn na mar :p,joy,"Dude become a child of a human being, do not die.",Dude human beings do not die: P: P,2
2,Terai uncle nai kahna hai kai ham nai to bahr ...,disgust,Your Uncle Nai says that we had sent out money,Your Ankali says that we sent out money and wa...,0
3,Yr ajao I m cming in the club,neutral,YR AJAO I'M Coming in the Club,Yer organs were the club,1
4,Mje wese Nimra ahmad ka Qur'aan ki aayaat k ba...,joy,Mje wes nimra ahmad ka qur'aan ki aayaat k bar...,Mje Wese Nimra Ahmad Ka Qur'aan Ki Aayaaat K B...,2


In [10]:
# class balance of the dev set: original emotions, then the collapsed sentiment codes
print(df_dev['emotion'].value_counts(), '\n')
print(df_dev['target'].value_counts())
df_dev.head()

neutral         388
joy             131
trust           125
disgust         113
optimism        110
anticipation     94
sadness          62
fear             52
surprise         35
anger            35
pessimism        29
love             17
Name: emotion, dtype: int64 

2    512
1    388
0    291
Name: target, dtype: int64


Unnamed: 0,text,emotion,target,gtp_translated,translated_hi,translated_ur,text_clean,gpt_pred,gpt_pred_num,gpt_translated2
0,Tension lene ki koi baat ni,neutral,1,There's no need to take tension.,There is nothing to take tension,Any talk of taking tangoes,Tension lene ki koi baat ni,neutral,1,There's no need to worry.
1,Main ghar punch gya hun or ab spny laga hun,neutral,1,I have reached home and now I am going to sleep.,I have gone home punch and now I am Sapni,I have gone home punch and now dreams,Main ghar punch gya hun or ab spny laga hun,neutral,1,I have reached home and now I am going to sleep.
2,Nai mje nai mili mail..mene check ki ti,pessimism,0,"I didn't receive any mail, I had checked.",Nai Maje Nai Mile Mail .. I checked,Ni Ni Ni Mille Mail,Nai mje nai mili mail .. mene check ki ti,neutral,1,I didn't receive any new mail. I had checked.
3,Yr us din mai pura din bzy rahe vo mujy awne h...,disgust,0,"That day, they were busy all day and not givin...",YR Us Din Mai Pura Din Bzy Rahe Vo Mujy Awne H...,Yr us din mai pura din bzy rahe vo mujy awne h...,Yr us din mai pura din bzy rahe vo mujy awne h...,negative,0,"I was busy the whole day on that day, they wer..."
4,Lakin wo abhe dar dar ka chalata ha,fear,0,But he still walks cautiously.,But it still moves at the rate,But Wu runs the cedar,Lakin wo abhe dar dar ka chalata ha,neutral,1,But he still walks with fear and hesitation.


In [11]:
# light text cleaning (should I use clean regex for better accuracy?)
# runs of anything that is not an ASCII letter or a space (punctuation, digits, ...)
pad_punct    = re.compile(r'([^a-zA-Z ]+)')
# raw string: '\s' in a plain string literal is an invalid escape sequence
multi_spaces = re.compile(r'\s{2,}')
#clean        = re.compile('[^a-zA-Z0-9,.?!\'\s]+')

def clean_text(s):
    '''
       Light text cleaning: turn newlines into spaces, surround every run of
       non-letter characters with spaces, collapse repeated whitespace
       and strip the ends. Expects a str and returns a str.
    '''
    s = s.replace('\n', ' ')
    s = pad_punct.sub(r' \1 ', s)
    #s = clean.sub(' ', s)
    s = multi_spaces.sub(' ', s)
    return s.strip()

# add a cleaned copy of the raw message to every split
# (the test CSV names its column 'Text', the pickles use 'text')
df_train['text_clean'] = df_train['text'].apply( clean_text )
df_dev['text_clean']   = df_dev['text'].apply( clean_text )
df_test['text_clean']  = df_test['Text'].apply( clean_text )

In [12]:
# 2K duplicates - these may affect class imbalance during training! TO BE REDUCED
# keep=False marks every member of a duplicate group, so the shapes below count
# all rows involved in duplication, not only the extra copies
print(df_train.shape)
temp1 = df_train[ df_train.duplicated(subset=['text_clean'], keep=False) ]             # same cleaned text
print(temp1.shape)
temp2 = df_train[ df_train.duplicated(subset=['text_clean', 'emotion'], keep=False) ]  # same text and same label
print(temp2.shape)
temp3 = df_train[ df_train.duplicated(keep=False) ]                                    # identical in every column
print(temp3.shape)

(9530, 6)
(4222, 6)
(4221, 6)
(4221, 6)


In [14]:
# 82 duplicates ['text_clean', 'emotion'] - can't reduce because this is a dev set
# (same three checks as for the train set; keep=False counts every row of a duplicate group)
print(df_dev.shape)
temp1 = df_dev[ df_dev.duplicated(subset=['text_clean'], keep=False) ]
print(temp1.shape)
temp2 = df_dev[ df_dev.duplicated(subset=['text_clean', 'emotion'], keep=False) ]
print(temp2.shape)
temp3 = df_dev[ df_dev.duplicated(keep=False) ]
print(temp3.shape)

(1191, 10)
(82, 10)
(82, 10)
(76, 10)


In [15]:
# 93 complete duplicates - can't reduce because this is a test set
# (no label column is used here, so only the text-level and full-row checks are run)
print(df_test.shape)
temp1 = df_test[ df_test.duplicated(subset=['text_clean'], keep=False) ]
print(temp1.shape)
temp3 = df_test[ df_test.duplicated(keep=False) ]
print(temp3.shape)

(1191, 2)
(93, 2)
(93, 2)


In [16]:
# df_train vs. df_dev: half of the dev set is in train set
# sets give constant-time membership tests instead of scanning the whole array per row
_train_texts = set(df_train['text_clean'].values)
_dev_texts   = set(df_dev['text_clean'].values)
overlap1 = [t for t in df_train['text_clean'].values if t in _dev_texts]     # train rows also present in dev
overlap2 = [t for t in df_dev['text_clean'].values if t in _train_texts]     # dev rows also present in train
len(overlap1), len(overlap2), len(set(overlap1)), len(set(overlap2))

(714, 554, 526, 526)

In [18]:
# df_test vs. rest (train + dev)
# build the lists / sets once instead of calling .tolist() again for every element
_seen_texts = df_train['text_clean'].tolist() + df_dev['text_clean'].tolist()
_test_texts = df_test['text_clean'].tolist()
_seen_set, _test_set = set(_seen_texts), set(_test_texts)
overlap3 = [ t for t in _seen_texts if t in _test_set ]     # train/dev rows also present in test
overlap4 = [ t for t in _test_texts if t in _seen_set ]     # test rows also present in train/dev
len(overlap3), len(overlap4), len(set(overlap3)), len(set(overlap4))

(817, 584, 557, 557)

In [19]:
# df_test vs. df_dev: texts shared between the dev set and the test set
# sets give constant-time membership tests instead of scanning the whole array per row
_dev_texts  = set(df_dev['text_clean'].values)
_test_texts = set(df_test['text_clean'].values)
overlap5 = [t for t in df_dev['text_clean'].values if t in _test_texts]     # dev rows also present in test
overlap6 = [t for t in df_test['text_clean'].values if t in _dev_texts]     # test rows also present in dev
len(overlap5), len(overlap6), len(set(overlap5)), len(set(overlap6))

(90, 97, 88, 88)

In [20]:
# df_test vs. df_train: texts shared between the train set and the test set
# sets give constant-time membership tests instead of scanning the whole array per row
_train_texts = set(df_train['text_clean'].values)
_test_texts  = set(df_test['text_clean'].values)
overlap7 = [t for t in df_train['text_clean'].values if t in _test_texts]     # train rows also present in test
overlap8 = [t for t in df_test['text_clean'].values if t in _train_texts]     # test rows also present in train
len(overlap7), len(overlap8), len(set(overlap7)), len(set(overlap8))

(727, 540, 519, 519)

The reason why baseline ML models perform better than ChatGPT is that they get a lot of hints from texts in the training set that are duplicated in the dev and test sets! ChatGPT doesn't have this knowledge because it's doing zero-shot classification! There are so many duplicates that they would not fit into the context window of ChatGPT anyway.

The only way to compare ML and ChatGPT correctly is to remove all of these duplicates from the TRAINING SET, then train the ML model, test it on the dev set, and compare it with ChatGPT! (Also, deduplicate the training set itself.)

Submission: use the non-overfit ML model or ChatGPT (whichever is better) on those samples from the test set that don't have duplicates in the training or dev set. Use the training/dev set labels for the duplicates in the test set.

In [21]:
# remove overlap with validation sets
# every cleaned text that occurs in the dev or test split (total vs. unique count printed)
val_sets = df_dev['text_clean'].tolist() + df_test['text_clean'].tolist()
print(len(val_sets), len(set(val_sets)))

# drop train rows whose cleaned text also appears in dev/test
# NOTE: df_train is overwritten here, so re-running this cell alone changes nothing further
print(df_train.shape)
df_train = df_train[ ~df_train['text_clean'].isin(val_sets) ]
print(df_train.shape)

2382 2206
(9530, 6)
(8151, 6)


In [22]:
# remove duplicates from train set
# rows count as duplicates only when both the cleaned text and the emotion match,
# so the same text with two different emotions is kept twice
df_train = df_train.drop_duplicates(subset=['text_clean', 'emotion'])
print(df_train.shape)

(6167, 6)


In [23]:
# is additional text cleaning necessary? I don't see why
from collections import Counter
# lower-cased, whitespace-split tokens of the whole (deduplicated) train set
train_words = ' '.join( df_train['text_clean'].tolist() ).lower().split()
c = Counter( train_words )
# the 350 most frequent tokens with their counts
c.most_common(350)

[('.', 2127),
 ('k', 1244),
 ('to', 1231),
 ('ha', 1214),
 ('hai', 804),
 ('ho', 793),
 ('ka', 726),
 ('me', 640),
 ('?', 615),
 ('b', 604),
 ('kr', 568),
 ('ga', 559),
 ('ni', 553),
 ('ko', 543),
 ('ki', 532),
 ('tha', 528),
 (',', 518),
 ('...', 502),
 ('na', 497),
 ('hn', 473),
 ('hy', 464),
 ('wo', 461),
 ('ma', 453),
 ('nai', 450),
 ('..', 450),
 ('a', 446),
 ('se', 415),
 ('p', 409),
 ('yar', 401),
 ('or', 392),
 ('yr', 389),
 ('h', 388),
 ('i', 385),
 ('han', 385),
 ('tu', 371),
 ('e', 331),
 (':', 327),
 ('ne', 324),
 ('kia', 321),
 ('he', 287),
 ('hain', 284),
 ('main', 281),
 ('ab', 254),
 ('koi', 252),
 ('us', 251),
 ('nae', 250),
 ('ap', 250),
 ('sir', 250),
 ('sy', 248),
 ('tm', 237),
 ('is', 223),
 ('nahi', 223),
 ('hi', 222),
 ('raha', 220),
 ('kal', 218),
 ('rha', 214),
 ('ja', 202),
 ('ny', 200),
 ('aj', 199),
 ('g', 199),
 ('m', 198),
 ('phr', 195),
 (':-', 193),
 ('aur', 192),
 ('mai', 192),
 ('....', 187),
 ('gya', 184),
 ('d', 183),
 ('bht', 181),
 ('u', 173),
 ('p

In [24]:
# Roman Urdu stop words
# https://www.kaggle.com/code/owaisraza009/roman-urdu-sentiment-analysis/notebook
stopwords1 = [ 'ai', 'ayi', 'hy', 'hai', 'main', 'ki', 'tha', 'koi', 'ko', 'sy', 'woh', 'bhi', 'aur', 'wo', 'yeh',
               'rha', 'hota', 'ho', 'ga', 'ka', 'le', 'lye', 'kr', 'kar', 'lye', 'liye', 'hotay', 'waisay', 'gya',
               'gaya', 'kch', 'ab', 'thy', 'thay', 'houn', 'hain', 'han', 'to', 'is', 'hi', 'jo', 'kya', 'thi', 'se',
               'pe', 'phr', 'wala', 'waisay', 'us', 'na', 'ny', 'hun', 'rha', 'raha', 'ja', 'rahay', 'abi', 'uski',
               'ne', 'haan', 'acha', 'nai', 'sent', 'photo', 'you', 'kafi', 'gai', 'rhy', 'kuch', 'jata', 'aye', 'ya',
               'dono', 'hoa', 'aese', 'de', 'wohi', 'jati', 'jb', 'krta', 'lg', 'rahi', 'hui', 'karna', 'krna', 'gi',
               'hova', 'yehi', 'jana', 'jye', 'chal', 'mil', 'tu', 'hum', 'par', 'hay', 'kis', 'sb', 'gy', 'dain',
               'krny', 'tou', ]

# https://github.com/haseebelahi/roman-urdu-stopwords.git
file = 'data/stopwords.txt'
# context manager closes the file handle instead of leaving it to the garbage collector
with open(file) as stopwords_file:
    stopwords2 = stopwords_file.read().split()
# check whether the file holds exactly the same list as stopwords1
print(stopwords2 == stopwords1)

# English stop words: use the public scikit-learn name instead of the private _stop_words module
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
stopwords_en  = ENGLISH_STOP_WORDS
# selected from stopwords_en
stopwords_en2 = [ 'a', 'about', 'also', 'am', 'an', 'and', 'are', 'as', 'at', 'be', 
                  'been', 'being', 'by', 'co', 'con', 'de', 'eg', 'eight', 'eleven', 'else', 'etc', 
                  'fifteen', 'fifty', 'five', 'for', 'forty', 'four', 'from', 'had',
                  'has', 'hasnt', 'have', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 
                  'his', 'how', 'i', 'ie', 'if', 'in', 'inc', 'into', 'is', 'it', 'its', 'itself',
                  'ltd', 'me', 'mine', 'my', 'myself', 'nine', 'no', 'now', 'of', 'off', 'on',
                  'once', 'one', 'onto', 'or', 'other', 'others', 'our', 'ours', 'ourselves',
                  'out', 'part', 'per', 're', 'several', 'she', 'side', 'since', 'six', 'sixty',
                  'so', 'ten', 'than', 'that', 'the', 'their', 'them',
                  'themselves', 'then', 'there', 'these', 'they', 'thick', 'thin', 'third', 'this', 'those', 
                  'three', 'to', 'twelve', 'twenty', 'two', 'un','us', 'very',
                  'via', 'was', 'we', 'were', 'what', 'when', 'where', 'whether', 'which', 'while', 
                  'who', 'whom', 'whose', 'why', 'with', 'within', 'would', 'yet', 'you', 'your', 'yours',
                   'yourself', 'yourselves', ]

# sizes of the Roman Urdu list, the full English list and the hand-picked English subset
print( len(stopwords1), len(stopwords_en), len(stopwords_en2), )

True
102 318 129


In [25]:
# upsample every class of 'target' to the size of the largest class (see upsample_all)
print('df_train before upsampling:\n', df_train['target'].value_counts(), sep='')

# NOTE: df_train is overwritten, so re-running this cell works on already-upsampled data
df_train = upsample_all( df_train.copy(), random_state=random_state )

print('df_train after upsampling:\n', df_train['target'].value_counts(), sep='')

df_train before upsampling:
2    2805
1    2097
0    1265
Name: target, dtype: int64
df_train after upsampling:
0    2805
1    2805
2    2805
Name: target, dtype: int64


In [26]:
# features are the cleaned texts, targets are the 3-class sentiment codes
X_train = df_train['text_clean'].values
y_train = df_train['target'].values

X_dev = df_dev['text_clean'].values
y_dev = df_dev['target'].values

# shuffle texts and labels together so that they stay aligned
X_train, y_train = sklearn.utils.shuffle( X_train, y_train, random_state=random_state, ) 
print( 'Shape of datasets: ', X_train.shape, y_train.shape, X_dev.shape, y_dev.shape, '\n')
# peek at the first five samples of each split
print(X_train[:5], y_train[:5], '\n')
print(X_dev[:5], y_dev[:5])

Shape of datasets:  (8415,) (8415,) (1191,) (1191,) 

['Mrra lapi ni kam kr ra sahi' 'mera intzar kr me a rha hon !!.'
 'Yr shor bht aa rha he'
 'Waqas kitni dair ma a rahe ho yar juldi a jao yar'
 'kahty kitna prhna bs kr do'] [0 2 0 0 1] 

['Tension lene ki koi baat ni'
 'Main ghar punch gya hun or ab spny laga hun'
 'Nai mje nai mili mail .. mene check ki ti'
 'Yr us din mai pura din bzy rahe vo mujy awne hi nai dy rahe the or kal b aisa hi raha koe naw koe aw raha tha aj to mn soba sy dekh rahe hn k tm aw jaao lkn tm to dada jee ke taraf ..'
 'Lakin wo abhe dar dar ka chalata ha'] [1 1 0 0 0]


# Baseline Model

In [49]:
# keyword arguments for MultinomialNB (its use in the model cell is commented out)
clf_params_nb = {
    'alpha': 1.0,
    'fit_prior': True,
}
# keyword arguments for LogisticRegression (the classifier the model cell currently uses)
clf_params_lr = {
    'C': 1.0,
    'solver': 'saga',
    'penalty': 'l2',
    'max_iter': 500,
    'random_state': random_state,
}

In [36]:
# keyword arguments for XGBClassifier (its use in the model cell is commented out)
# NOTE(review): both 'seed' (2) and 'random_state' (random_state) are supplied with
# different values - confirm which one is meant to take effect.
clf_params_xgb = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.3,                                 # eta
    'objective': 'multi:softmax',                         # multi:softmax, multi:softprob, rank:pairwise
    'eval_metric': 'merror',                              # multiclass - merror, mlogloss
    'base_score': 0.5,
    'booster': 'gbtree',                                  # gbtree, dart
    'tree_method': 'auto',                                # auto, exact, approx, hist and gpu_hist
    'importance_type': 'gain',                            # default“gain”,“weight”,“cover”,“total_gain”,“total_cover”
    'gamma': 0,                                           # larger - more conservative, [0, inf]
    'reg_alpha': 0,                                       # L1 reg., larger - more conservative
    'reg_lambda': 1,                                      # L2 reg., larger - more conservative
    'sampling_method': 'uniform',                         # uniform, gradient_based
    'max_delta_step': 1,                                  # 1-10
    'min_child_weight': 1,
    'subsample': 1.0,                                     # 0-1    
    'colsample_bylevel': 1.0,                             # 0-1
    'colsample_bynode': 1.0,                              # optimized for higher recall
    'colsample_bytree': 1.0,                              # 0-1  
    'seed': 2,
    'num_class': 3,
    #'use_label_encoder': False,
    'random_state': random_state,
    'n_jobs': -1,    
}

In [50]:
# keyword arguments shared by TfidfVectorizer / CountVectorizer: character n-grams of length 1-7
# NOTE(review): the scikit-learn documentation says `stop_words` only applies when
# analyzer == 'word'; with analyzer='char' the list below is presumably ignored - verify.
vect_params = {
    'max_df': 1.0,
    'min_df': 1,
    'analyzer': 'char',
    'ngram_range': (1,7),
    'binary': True,
    'stop_words': stopwords1 + stopwords_en2,
}

In [51]:
# pick one vectorizer and one classifier; the commented lines are the alternatives tried
vectorizer = TfidfVectorizer( **vect_params )
#vectorizer = CountVectorizer( **vect_params )

#clf = MultinomialNB( **clf_params_nb )
clf = LogisticRegression( **clf_params_lr )
#clf = XGBClassifier( **clf_params_xgb )


# vectorizer + classifier in one pipeline, fitted on the upsampled train set
model = Pipeline( steps=[('vect', vectorizer), ('clf', clf)] )
model.fit(X_train, y_train)



In [52]:
# make predictions on the train set (to gauge overfitting) and on the dev set
y_pred_train = model.predict(X_train)
y_pred_dev   = model.predict(X_dev)

In [53]:
# print classification reports
# show the fitted pipeline steps first so that the report is tied to its configuration
print('Vectorizer:\n', model['vect'], '\n', sep='')
print('Classifier:\n', model['clf'], '\n', sep='')

# per-class precision / recall / F1 on the data the model was trained on
print('\nTRAINSET')
print( classification_report( y_train, y_pred_train, digits=4 ) )

# ... and on the dev set
print('DEVSET')
print( classification_report( y_dev, y_pred_dev, digits=4 ) )

Vectorizer:
TfidfVectorizer(analyzer='char', binary=True, ngram_range=(1, 7),
                stop_words=['ai', 'ayi', 'hy', 'hai', 'main', 'ki', 'tha',
                            'koi', 'ko', 'sy', 'woh', 'bhi', 'aur', 'wo', 'yeh',
                            'rha', 'hota', 'ho', 'ga', 'ka', 'le', 'lye', 'kr',
                            'kar', 'lye', 'liye', 'hotay', 'waisay', 'gya',
                            'gaya', ...])

Classifier:
LogisticRegression(max_iter=500, random_state=47, solver='saga')


TRAINSET
              precision    recall  f1-score   support

           0     0.9865    0.9922    0.9893      2805
           1     0.9868    0.9825    0.9846      2805
           2     0.9839    0.9825    0.9832      2805

    accuracy                         0.9857      8415
   macro avg     0.9857    0.9857    0.9857      8415
weighted avg     0.9857    0.9857    0.9857      8415

DEVSET
              precision    recall  f1-score   support

           0     0.5225    0.5189   

In [368]:
# keep the baseline classifier's dev predictions next to the dev rows
# (improve_predictions reads this column)
df_dev['clf_pred'] = y_pred_dev

```
LogisticRegression is less overfit

Vectorizer:
TfidfVectorizer(analyzer='char', min_df=2, ngram_range=(1, 7),
                stop_words=stopwords + stopwords_en2)

Classifier:
LogisticRegression(max_iter=500, random_state=47, solver='liblinear')


TRAINSET
              precision    recall  f1-score   support

           0     0.9520    0.9687    0.9603      6268
           1     0.9682    0.9512    0.9596      6268

    accuracy                         0.9600     12536
   macro avg     0.9601    0.9600    0.9600     12536
weighted avg     0.9601    0.9600    0.9600     12536

DEVSET
              precision    recall  f1-score   support

           0     0.7619    0.6598    0.7072       388
           1     0.8456    0.9004    0.8721       803

    accuracy                         0.8220      1191
   macro avg     0.8038    0.7801    0.7897      1191
weighted avg     0.8183    0.8220    0.8184      1191



Vectorizer:
TfidfVectorizer(analyzer='char_wb', ngram_range=(1, 5),
                stop_words=stopwords + stopwords_en2)

Classifier:
XGBClassifier( n_estimators=200 OR max_depth=7, ...)


TRAINSET
              precision    recall  f1-score   support

           0     0.9894    0.9995    0.9944      6268
           1     0.9995    0.9893    0.9944      6268

    accuracy                         0.9944     12536
   macro avg     0.9945    0.9944    0.9944     12536
weighted avg     0.9945    0.9944    0.9944     12536

DEVSET
              precision    recall  f1-score   support

           0     0.7136    0.7191    0.7163       388
           1     0.8638    0.8605    0.8621       803

    accuracy                         0.8144      1191
   macro avg     0.7887    0.7898    0.7892      1191
weighted avg     0.8148    0.8144    0.8146      1191
```

# ChatGPT API: Zero-Shot Classification

In [195]:
# Single-message zero-shot prompt: asks for a translation followed by exactly one of
# positive / neutral / negative; the {} placeholder is filled with the message text.
prompt_one   = '''The text below may contain words or phrases in Roman Urdu along with English. Translate the text below into English. Then classify the translated text into one most relevant category from the following list of categories: positive, neutral, or negative. The category must depend on the sentiment or emotional content of the text. Classify the text below and output only one most relevant category from the above list of categories. Text: "{}"'''
# preview the rendered prompt on a dummy sample
s = 'This is a text sample'
print(prompt_one.format(s), '\n')

The text below may contain words or phrases in Roman Urdu along with English. Translate the text below into English. Then classify the translated text into one most relevant category from the following list of categories: positive, neutral, or negative. The category must depend on the sentiment or emotional content of the text. Classify the text below and output only one most relevant category from the above list of categories. Text: "This is a text sample" 



In [196]:
# Using followup Q1 can improve the response. If the response has multiple words, first parse it and try to find
# the category in it. Only if this doesn't work, send followup Q2. ChatGPT can offer the second category in response
# to Q1, but can change its mind again and offer a third category if asked Q2
followup1 = 'Are you sure about that?'
followup2 = 'Output only the category and nothing else'

In [198]:
# API key comes from the environment (None when OPENAI_API_KEY is not set)
openai.api_key = os.getenv("OPENAI_API_KEY")
model          = 'gpt-3.5-turbo'
# the only answers accepted from ChatGPT
labels_set     = {'positive', 'neutral', 'negative'}
# regexes used by verify_label: drop everything except ASCII letters and spaces,
# then collapse repeated whitespace
clean = re.compile(r'[^a-zA-Z ]+')
# raw string: '\s' in a plain string literal is an invalid escape sequence
multi_spaces = re.compile(r'\s{2,}')
print(labels_set)

{'neutral', 'negative', 'positive'}


In [199]:
def verify_label(label_):
    '''
       Verify if label_ contains any of the categories
       from the predefined set of labels

       Returns the category found in label_; if several different categories
       occur they are joined with '/' in order of first appearance (so the
       result is the same on every run). Returns None if no category is found.
    '''
    label_ = clean.sub(' ', label_)
    label_ = multi_spaces.sub(' ', label_).lower().split()
    # dict.fromkeys removes repeats while keeping the order of first appearance
    # (a plain set would make the order of the joined labels vary between runs)
    res    = list(dict.fromkeys(i for i in label_ if i in labels_set))
    return '/'.join(res) if res else None

In [200]:
def verify_num_tokens(model, messages, max_prompt_tokens=3950):
    '''
       Check that there are enough tokens available for a ChatGPT response

       model             - model name passed on to num_tokens_from_messages
       messages          - list of chat messages about to be sent
       max_prompt_tokens - largest accepted token count of the messages (default 3950)

       Returns True if the messages fit; otherwise prints the token count and the
       content of the last message and returns False.
    '''
    num_tokens_tiktoken = num_tokens_from_messages(messages, model)
    if num_tokens_tiktoken <= max_prompt_tokens:
        return True

    # report the offending text taken from the messages themselves
    # (the previous version referenced a name that is not defined in this function)
    last_content = messages[-1].get('content', '') if messages else ''
    print(f'Number of tokens is {num_tokens_tiktoken} which exceeds {max_prompt_tokens}')
    print(f'TEXT: {last_content}\n')
    return False


def get_response(model, messages, temperature=0, max_tokens=None):
    '''
       Send request, return response

       model       - name of the chat model
       messages    - list of chat messages ({"role": ..., "content": ...})
       temperature - range(0,2), the more the less deterministic / focused
       max_tokens  - optional cap on the tokens to return; sent to the API
                     only when it is not None

       Returns the stripped text content of the first completion choice.
    '''
    request = {
        'model': model,
        'messages': messages,
        'temperature': temperature,
        'top_p': 1,                       # top probability mass, e.g. 0.1 = only tokens from top 10% proba mass
        'n': 1,                           # number of chat completions
        'stream': False,
        'stop': None,                     # sequence to stop generation (new line, end of text, etc.)
    }
    # honour the caller's limit instead of silently ignoring the parameter;
    # the key is left out entirely when no limit was requested
    if max_tokens is not None:
        request['max_tokens'] = max_tokens

    response = openai.ChatCompletion.create(**request)
    content = response['choices'][0]['message']['content'].strip()
    #num_tokens_api = response['usage']['prompt_tokens']
    return content

In [217]:
def translate_text(text_, prompt_):
    '''
       Translate text_ using prompt_ and ChatGPT API

       prompt_ is a template with one {} placeholder that receives text_.
       Returns the model's reply, or None when the messages do not pass
       verify_num_tokens.
    '''
    system_msg = { "role": "system", "content": "You are a smart translator from Roman Urdu into English.", }
    user_msg   = { "role": "user", "content": prompt_.format(text_), }
    chat       = [ system_msg, user_msg ]

    # only call the API when the prompt leaves room for a reply
    if verify_num_tokens(model, chat):
        return get_response(model, chat)
    return None

# Translation-only prompt; the {} placeholder is filled with the message text.
prompt_translate = '''The entire text below or parts of it can be written in Roman Urdu. Act as a smart Roman Urdu to English translator, and do your best to translate the text below completely into English. Pay special attention to and carefully convey the correct English meaning of any words or phrases that describe sentiment or emotions. Based on the above instructions carefully translate the text below into English. Output only the English translation. Text: {}'''
# preview the rendered prompt on a dummy sample
s = 'This is a text sample'
print(prompt_translate.format(s), '\n')

The entire text below or parts of it can be written in Roman Urdu. Act as a smart Roman Urdu to English translator, and do your best to translate the text below completely into English. Pay special attention to and carefully convey the correct English meaning of any words or phrases that describe sentiment or emotions. Based on the above instructions carefully translate the text below into English. Output only the English translation. Text: This is a text sample 



In [202]:
def classify_text(text_, prompt_):
    '''
       Classify text_ using prompt_ and ChatGPT API

       Sends the filled-in prompt; if no known category can be found in the
       reply, continues the chat once with followup1. Returns the category
       found by verify_label, the raw last reply when no category was found,
       or None when the prompt does not pass verify_num_tokens.
    '''
    chat = [
            #{ "role": "system", "content": "You are a very accurate zero-shot text classifier.", },
            { "role": "user", "content": prompt_.format(text_), },
            ]
    if not verify_num_tokens(model, chat):
        return None

    # first attempt: the reply may be longer than one word or carry extra characters
    raw_reply = get_response(model, chat)
    parsed    = verify_label(raw_reply)
    if parsed is not None:
        return parsed

    # no category in the reply - extend the chat with the follow-up question
    chat += [
        { "role": "assistant", "content": raw_reply, },
        { "role": "user", "content": followup1, }
        ]
    raw_reply = get_response(model, chat)
    parsed    = verify_label(raw_reply)

    return raw_reply if parsed is None else parsed

In [203]:
def classify_text2(text_, prompt_):
    '''
       Classify text_ using prompt_ and ChatGPT API and two questions

       Sends the filled-in prompt; while no known category can be found in the
       reply, continues the chat first with followup1 and then, if that reply
       is still unusable, with followup2. Returns the category found by
       verify_label, the raw last reply when no category was found at all,
       or None when the prompt does not pass verify_num_tokens.
    '''
    # compose messages and check num_tokens
    messages = [
            #{ "role": "system", "content": "You are a very accurate zero-shot text classifier.", },
            { "role": "user", "content": prompt_.format(text_), },
            ]
    if not verify_num_tokens(model, messages): return None

    # the reply may be longer than one word or carry additional characters
    reply  = get_response(model, messages)
    label_ = verify_label(reply)

    # ask the follow-up questions one after another, stopping at the first
    # reply that contains a known category
    for followup in (followup1, followup2):
        if label_ is not None:
            break
        messages += [
            { "role": "assistant", "content": reply, },
            { "role": "user", "content": followup, },
            ]
        reply  = get_response(model, messages)
        label_ = verify_label(reply)

    return label_ if label_ is not None else reply

In [218]:
# test as single prompt
idx = 11
# 'emotion' holds one label string per row
text, groundtruth_labels = df_dev[['text_clean', 'emotion']].values[idx]
#label = classify_text(text, prompt_one)
translated = translate_text(text, prompt_translate)

print(prompt_one.format( text ))
# print the label as is: joining a plain string with '/' would split it into single characters
print(f"\nGROUNDTRUTH LABEL:\n{groundtruth_labels}")
# NOTE: with the classify_text call commented out, the value printed here is the translation
print(f"\nPREDICTED LABEL:\n{translated}")
#print(f'\nTOTAL TOKENS: {tokens}')

The text below may contain words or phrases in Roman Urdu along with English. Translate the text below into English. Then classify the translated text into one most relevant category from the following list of categories: positive, neutral, or negative. The category must depend on the sentiment or emotional content of the text. Classify the text below and output only one most relevant category from the above list of categories. Text: "Yar phely kab mana kia ha tm lOgo kO yr ajaO wsy b abi free hO"

GROUNDTRUTH LABEL:
n/e/u/t/r/a/l

PREDICTED LABEL:
"Yaar pehle kab mana kia hai tum logon ko yaar ajaao. Waise bhi abhi free ho."

Translation: "Friend, when did I ever refuse you guys to come over? Come on over, I'm free right now anyway."


In [219]:
# prompt 1 tqdm results - 1191/1191 [25:38<00:00, 1.18s/it]
# one API call per dev row; rows rejected by verify_num_tokens end up as None
df_dev['gpt_translated2'] = df_dev['text_clean'].progress_apply( lambda x: translate_text(x, prompt_translate) )

  0%|          | 0/1191 [00:00<?, ?it/s]

In [223]:
def strip_apostr(s):
    '''
       Remove one leading and one trailing double quote from s, if present.
       Values that are not strings (e.g. None for rows without a translation)
       are returned unchanged.
    '''
    if not isinstance(s, str):
        return s
    if s.startswith('"'):
        s = s[1:]
    if s.endswith('"'):
        s = s[:-1]
    return s


# drop the double quotes that may wrap a returned translation
df_dev['gpt_translated2'] = df_dev['gpt_translated2'].apply( strip_apostr )

In [225]:
#file = 'data/mcec_dev2.xlsx'
#df_dev.to_excel(file, index=False, encoding='utf-8')

In [259]:
#file2 = 'data/mcec_dev_translated.pkl'
#df_dev.to_pickle( file2 )

In [208]:
# How often each label occurs in the 'gpt_pred' column of df_dev
df_dev.loc[:, 'gpt_pred'].value_counts()

neutral     1019
negative      88
positive      84
Name: gpt_pred, dtype: int64

In [209]:
# Per-column count of missing values in df_dev
df_dev.isnull().sum()

text              0
emotion           0
target            0
gtp_translated    0
translated_hi     0
translated_ur     0
text_clean        0
gpt_pred          0
dtype: int64

In [211]:
def convert_pred(pred):
    '''Map a textual sentiment label to its numeric class id.

    'negative' -> 0, 'neutral' -> 1, 'positive' -> 2.
    Any other value yields None.
    '''
    label_codes = (
        ('negative', 0),
        ('neutral', 1),
        ('positive', 2),
    )
    # Compare in the same order with plain equality; first match wins
    for label, code in label_codes:
        if pred == label:
            return code
    return None

# Numeric version of the ChatGPT label, then a check for missing values
df_dev['gpt_pred_num'] = df_dev['gpt_pred'].map(convert_pred)
print(df_dev.isnull().sum())
df_dev.loc[:, 'gpt_pred_num'].value_counts()

text              0
emotion           0
target            0
gtp_translated    0
translated_hi     0
translated_ur     0
text_clean        0
gpt_pred          0
gpt_pred_num      0
dtype: int64


1    1019
0      88
2      84
Name: gpt_pred_num, dtype: int64

In [129]:
# if ChatGPT made no prediction, choose the prediction coming from the classifier
def improve_predictions(row):
    '''Fall back to the classifier's prediction when ChatGPT produced none.

    Parameters
    ----------
    row : mapping-like with a ``copy`` method (e.g. a row passed by
        ``df.apply(improve_predictions, axis=1)``).
        Must provide 'gpt_pred_num' and 'clf_pred'.

    Returns
    -------
    A copy of ``row``. When 'gpt_pred_num' is missing (None or NaN),
    'gpt_pred_binary' in the copy is set to ``row['clf_pred']``; otherwise
    the copy is returned unchanged.
    '''
    # Work on a copy: functions passed to apply should not mutate their input
    row = row.copy()
    # pd.isna matches both None and NaN; a bare `is None` test misses NaN,
    # which is how pandas stores a missing value in a numeric column.
    # NOTE(review): the result is written to 'gpt_pred_binary', while the
    # evaluation cell reads 'gpt_pred_num' - confirm the intended column.
    if pd.isna(row['gpt_pred_num']):
        row['gpt_pred_binary'] = row['clf_pred']
    return row

#df_dev = df_dev.apply( improve_predictions, axis=1 )

In [212]:
# Compare the 'target' column with the numeric ChatGPT predictions
y_dev, y_dev_pred = (df_dev[col].values for col in ('target', 'gpt_pred_num'))
dev_report = classification_report(y_dev, y_dev_pred, digits=4)
print( dev_report )

              precision    recall  f1-score   support

           0     0.6818    0.2062    0.3166       291
           1     0.3582    0.9407    0.5188       388
           2     0.7738    0.1270    0.2181       512

    accuracy                         0.4114      1191
   macro avg     0.6046    0.4246    0.3512      1191
weighted avg     0.6159    0.4114    0.3402      1191



## APPENDIX

### Prompts and results

_Prompt_: The text below may contain words or phrases in Roman Urdu along with English. Translate the text below into English. Then classify the translated text into one most relevant category from the following list of categories: positive, neutral, or negative. The category must depend on the sentiment or emotional content of the text. Classify the text below and output only one most relevant category from the above list of categories. Text: "This is a text sample"
```
              precision    recall  f1-score   support

           0     0.6818    0.2062    0.3166       291
           1     0.3582    0.9407    0.5188       388
           2     0.7738    0.1270    0.2181       512

    accuracy                         0.4114      1191
   macro avg     0.6046    0.4246    0.3512      1191
weighted avg     0.6159    0.4114    0.3402      1191
```

_Prompt used for gpt_translated2 column translation_: The entire text below or parts of it can be written in Roman Urdu. Act as a smart Roman Urdu to English translator, and do your best to translate the text below completely into English. Pay special attention to and carefully convey the correct English meaning of any words or phrases that describe sentiment or emotions. Based on the above instructions carefully translate the text below into English. Output only the English translation. Text: Yar phely kab mana kia ha tm lOgo kO yr ajaO wsy b abi free hO

GROUNDTRUTH LABEL:
neutral

PREDICTED LABEL:
"Yaar pehle kab mana kia hai tum logon ko yaar ajaao. Waise bhi abhi free ho."

Translation: "Friend, when did I ever refuse you guys to come over? Come on over, I'm free right now anyway."

_Conclusion about the prompt_: this prompt is not effective. It sometimes outputs extraneous text (see above) that needs additional cleaning. Sometimes ChatGPT says that it cannot translate because it does not know how (possibly because "smart" or "do your best" push it toward perfectionism), even though the first translation prompt did produce a translation. ChatGPT is also not always clear about the direction of translation (possibly because of "Roman Urdu to English translator"). It also makes comments like: "Sorry, I cannot translate because this is a form of slang that is not appropriate for professional or polite conversation". When the text is already in English, it may say: "Sorry, I cannot translate Roman Urdu into English without the Roman Urdu text."

### Best HPs

```
clf_params_lr = {
    'C': 1.0,
    'solver': 'saga',
    'penalty': 'l2',
    'max_iter': 500,
    'random_state': random_state,
}
vect_params = {
    'max_df': 1.0,
    'min_df': 1,
    'analyzer': 'char',
    'ngram_range': (1,7),
    'binary': True,
    'stop_words': stopwords1 + stopwords_en2,
}

Vectorizer:
TfidfVectorizer(analyzer='char', binary=True, ngram_range=(1, 7),
                stop_words=['ai', 'ayi', 'hy', 'hai', 'main', 'ki', 'tha',
                            'koi', 'ko', 'sy', 'woh', 'bhi', 'aur', 'wo', 'yeh',
                            'rha', 'hota', 'ho', 'ga', 'ka', 'le', 'lye', 'kr',
                            'kar', 'lye', 'liye', 'hotay', 'waisay', 'gya',
                            'gaya', ...])

Classifier:
LogisticRegression(max_iter=500, random_state=47, solver='saga')


TRAINSET
              precision    recall  f1-score   support

           0     0.9865    0.9922    0.9893      2805
           1     0.9868    0.9825    0.9846      2805
           2     0.9839    0.9825    0.9832      2805

    accuracy                         0.9857      8415
   macro avg     0.9857    0.9857    0.9857      8415
weighted avg     0.9857    0.9857    0.9857      8415

DEVSET
              precision    recall  f1-score   support

           0     0.5225    0.5189    0.5207       291
           1     0.5235    0.4871    0.5047       388
           2     0.6451    0.6816    0.6629       512

    accuracy                         0.5785      1191
   macro avg     0.5637    0.5626    0.5627      1191
weighted avg     0.5755    0.5785    0.5766      1191
```

-----

```
Vectorizer:
TfidfVectorizer(analyzer='char_wb', binary=True, min_df=2, ngram_range=(1, 5),
                stop_words=stopwords1 + stopwords_en2)

Classifier:
XGBClassifier(learning_rate=0.3, n_estimators=100, max_depth=6, ...)


TRAINSET
              precision    recall  f1-score   support

           0     0.9818    0.9811    0.9815      2805
           1     0.9386    0.9857    0.9616      2805
           2     0.9861    0.9373    0.9611      2805

    accuracy                         0.9680      8415
   macro avg     0.9688    0.9680    0.9680      8415
weighted avg     0.9688    0.9680    0.9680      8415

DEVSET
              precision    recall  f1-score   support

           0     0.5205    0.4364    0.4748       291
           1     0.5116    0.6263    0.5632       388
           2     0.6716    0.6191    0.6443       512

    accuracy                         0.5768      1191
   macro avg     0.5679    0.5606    0.5607      1191
weighted avg     0.5826    0.5768    0.5764      1191
```