# WASSA 2023 Shared Task on Multi-Label and Multi-Class Emotion Classification on Code-Mixed Text Messages

In [43]:
import openai
import numpy as np
import pandas as pd
import sklearn
import re, os
import time
import zipfile
from typing import List
from copy import deepcopy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from tqdm.autonotebook import tqdm
import tiktoken
# Register tqdm's pandas integration so progress bars can be shown for pandas operations.
tqdm.pandas()

# Raise pandas' display limits so wide / long frames are not truncated in cell output.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 400)

In [2]:
# new new version (Dec 2022)
def upsample_all( df_, labels_col='target', random_state=47 ):
    '''
        Upsample each class in column labels_col of pandas dataframe df_
        to the number of data points in the majority class.

        Every original row is kept; each minority class additionally receives
        (majority size - class size) rows sampled from itself. Sampling is done
        without replacement when the class has enough rows to cover the deficit
        and with replacement otherwise.

        Parameters
        ----------
        df_          : pandas DataFrame holding the data (not modified)
        labels_col   : name of the column with class labels
        random_state : seed used for every sampling / shuffling step

        Returns
        -------
        A new, shuffled DataFrame with a fresh 0..n-1 index in which every class
        has the same number of rows. Rows whose label is missing are dropped.
        An empty frame (same columns) is returned when there is nothing to balance.
    '''
    # Missing labels never match an equality filter, so leave them out up front.
    labels = df_[labels_col].dropna().unique()
    if len(labels) == 0:
        # Empty input or no usable labels: nothing to balance.
        return df_.iloc[0:0].reset_index(drop=True)

    # One sub-frame per class, and the size every class has to reach.
    groups  = { label: df_[ df_[labels_col] == label ] for label in labels }
    max_len = max( len(group) for group in groups.values() )

    balanced = []
    for label in labels:
        group   = groups[label]
        deficit = max_len - len(group)                    # how many extra rows this class needs
        if deficit > 0:
            # Only sample with replacement when the class is too small to cover the deficit.
            extra = group.sample( deficit, replace=len(group) < deficit, random_state=random_state )
            group = pd.concat( [group, extra] )           # class size + (max_len - class size)
        balanced.append( group.sample( frac=1, random_state=random_state ) )   # shuffle within class

    # Combine all classes and reshuffle.
    df_merged = pd.concat( balanced )
    return df_merged.sample( frac=1, random_state=random_state ).reset_index(drop=True)

In [3]:
# Emotion name -> integer class id. Ids follow the position in the list below,
# which is only roughly ordered by decreasing frequency in the training set.
label2key = {
    name: idx
    for idx, name in enumerate( [
        'neutral', 'joy', 'trust', 'disgust', 'optimism', 'anticipation',
        'sadness', 'fear', 'surprise', 'anger', 'pessimism', 'love',
    ] )
}
# Inverse lookup: integer class id -> emotion name.
key2label = dict( zip( label2key.values(), label2key.keys() ) )
key2label

{0: 'neutral',
 1: 'joy',
 2: 'trust',
 3: 'disgust',
 4: 'optimism',
 5: 'anticipation',
 6: 'sadness',
 7: 'fear',
 8: 'surprise',
 9: 'anger',
 10: 'pessimism',
 11: 'love'}

In [4]:
# Single seed reused for upsampling, shuffling and the classifier parameter dicts.
random_state = 47

# Load and Prepare Data

In [5]:
# Locations of the train and dev splits (relative to the notebook's working directory)
file1, file2 = 'data/mcec_train.csv', 'data/mcec_dev.csv'

# Read both splits, train first, then report their (rows, columns) shapes
df_train, df_dev = pd.read_csv(file1), pd.read_csv(file2)

print(df_train.shape, df_dev.shape)

(9530, 2) (1191, 2)


In [6]:
# Class distribution of the training labels, followed by a peek at the first rows.
print(df_train['Emotion'].value_counts())
df_train.head()

neutral         3262
trust           1118
joy             1022
optimism         880
anticipation     832
disgust          687
sadness          486
fear             453
anger            226
surprise         199
love             187
pessimism        178
Name: Emotion, dtype: int64


Unnamed: 0,Text,Emotion
0,Yes.I am in fyp lab cabin.but fyp presentation...,neutral
1,Yar insan ka bcha bn chawliyn na mar :p,joy
2,Terai uncle nai kahna hai kai ham nai to bahr ...,disgust
3,Yr ajao I m cming in the club,neutral
4,Mje wese Nimra ahmad ka Qur'aan ki aayaat k ba...,joy


In [7]:
# Add an integer 'target' column to both splits by looking each emotion name up in label2key.
# An emotion name missing from label2key would end up as NaN rather than raising.
for frame in (df_train, df_dev):
    frame['target'] = frame['Emotion'].map( label2key )

In [8]:
# Balance the training split only (the dev split is left untouched): print the class
# counts before and after every class is upsampled to the majority-class size.
# Note that df_train is rebound to the upsampled frame from here on.
print('\nTrain set b4 upsampling:\n', df_train['Emotion'].value_counts(), sep='')
df_train = upsample_all( df_train.copy(), labels_col='target', random_state=random_state )
print('\nTrain set after upsampling:\n', df_train['Emotion'].value_counts(), '\n\n', 
       df_train['target'].value_counts(), sep='')


Train set b4 upsampling:
neutral         3262
trust           1118
joy             1022
optimism         880
anticipation     832
disgust          687
sadness          486
fear             453
anger            226
surprise         199
love             187
pessimism        178
Name: Emotion, dtype: int64

Train set after upsampling:
fear            3262
pessimism       3262
anger           3262
joy             3262
anticipation    3262
neutral         3262
love            3262
surprise        3262
optimism        3262
disgust         3262
trust           3262
sadness         3262
Name: Emotion, dtype: int64

7     3262
10    3262
9     3262
1     3262
5     3262
0     3262
11    3262
8     3262
4     3262
3     3262
2     3262
6     3262
Name: target, dtype: int64


In [9]:
# Raw message texts are the features; integer emotion ids are the targets.
X_train, y_train = df_train['Text'].values, df_train['target'].values
X_dev,   y_dev   = df_dev['Text'].values,   df_dev['target'].values

# Shuffle training texts and labels in unison (seeded), then report the array shapes.
X_train, y_train = sklearn.utils.shuffle( X_train, y_train, random_state=random_state )
print( 'Shape of datasets: ', *( arr.shape for arr in (X_train, y_train, X_dev, y_dev) ) )

Shape of datasets:  (39144,) (39144,) (1191,) (1191,)


# Strong Baseline Model

In [10]:
# Keyword arguments for sklearn.svm.SVC (most values are the library defaults, spelled out).
clf_params_svm = dict(
    C=1.0,                              # default=1.0
    kernel='rbf',                       # default='rbf'; one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
    degree=3,                           # default=3, degree of the polynomial kernel
    tol=1e-3,                           # default=1e-3, stopping criterion
    gamma='scale',                      # default='scale', kernel coefficient for 'rbf', 'poly' and 'sigmoid'
                                        # 'scale' => 1 / (n_features * X.var()), 'auto' => 1 / n_features
    coef0=0.0,                          # default=0.0, independent term of the 'poly' and 'sigmoid' kernels
    shrinking=True,                     # default=True
    cache_size=200,                     # default=200, size of the kernel cache (in MB)
    decision_function_shape='ovr',      # default='ovr'; 'ovo' or 'ovr' (multiclass training is always 'ovo')
    break_ties=False,                   # default=False, for decision_function_shape='ovr' and >2 classes (slower)
    max_iter=-1,                        # default=-1, limit on iterations (-1 = no limit)
    class_weight='balanced',            # default=None; a dict or 'balanced'
    probability=True,
    verbose=0,
    random_state=random_state,
)

In [11]:
# Keyword arguments for XGBClassifier (12-class, single-label setup).
# NOTE(review): both 'seed' (2) and 'random_state' (47) are set, to different values —
# confirm which one the installed XGBoost version actually honours.
clf_params_xgb = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.3,                                 # eta
    'objective': 'multi:softmax',                         # multi:softmax, multi:softprob, rank:pairwise
    'eval_metric': 'merror',                              # multiclass - merror, mlogloss
    'base_score': 0.5,
    'booster': 'gbtree',                                  # gbtree, dart
    'tree_method': 'auto',                                # auto, exact, approx, hist and gpu_hist
    'importance_type': 'gain',                            # default 'gain'; also 'weight', 'cover', 'total_gain', 'total_cover'
    'gamma': 0,                                           # larger - more conservative, [0, inf]
    'reg_alpha': 0,                                       # L1 reg., larger - more conservative
    'reg_lambda': 1,                                      # L2 reg., larger - more conservative
    'sampling_method': 'uniform',                         # uniform, gradient_based
    'max_delta_step': 1,                                  # 1-10
    'min_child_weight': 1,
    'subsample': 1.0,                                     # 0-1
    'colsample_bylevel': 1.0,                             # 0-1
    'colsample_bynode': 1.0,                              # 0-1 (original note: "optimized for higher recall")
    'colsample_bytree': 1.0,                              # 0-1
    'seed': 2,
    'num_class': 12,                                      # one per entry in label2key
    #'use_label_encoder': False,
    'random_state': random_state,
    'n_jobs': -1,
}

In [55]:
# Feature-extraction settings passed to TfidfVectorizer / CountVectorizer:
# binary character n-grams of length 1 to 5 with no document-frequency pruning.
vect_params = {
    'max_df': 1.0,               # keep n-grams regardless of how many documents contain them
    'min_df': 1,                 # keep n-grams that occur in at least one document
    'analyzer': 'char',          # character-level n-grams
    'ngram_range': (1,5),
    'binary': True,              # presence / absence instead of raw term counts
    # scikit-learn applies stop words only when analyzer == 'word'; with 'char' the former
    # value 'english' was silently unused, so it is stated explicitly as None.
    'stop_words': None,
}

In [56]:
# Feature extractor built from vect_params
# (alternative tried: CountVectorizer( **vect_params ))
vectorizer = TfidfVectorizer( **vect_params )

# Classifier; alternatives tried:
#   SVC( **clf_params_svm ), XGBClassifier( **clf_params_xgb ), MultinomialNB()
clf = LogisticRegression( max_iter=500 )

# Chain vectorizer and classifier so raw texts can be passed straight to fit / predict
model = Pipeline( [ ('vect', vectorizer), ('clf', clf) ] )
model.fit( X_train, y_train )



In [57]:
# Hard class-id predictions for the (upsampled) training split and the dev split
y_pred_train, y_pred_dev = model.predict( X_train ), model.predict( X_dev )

# Keep the dev predictions next to the gold labels: numeric id plus readable emotion name
df_dev['clf_pred']         = y_pred_dev
df_dev['clf_pred_emotion'] = df_dev['clf_pred'].map( key2label )

# Emotion names in class-id order, used as row names in the classification reports
labels = list( label2key )
print('Labels:', labels)

Labels: ['neutral', 'joy', 'trust', 'disgust', 'optimism', 'anticipation', 'sadness', 'fear', 'surprise', 'anger', 'pessimism', 'love']


In [58]:
# Print the fitted pipeline's two steps, then per-class precision / recall / F1
# for the training split and the dev split.
# NOTE(review): target_names is positional, so the row names are only correct if the
# class ids seen in y_true / y_pred are exactly 0..11 in label2key order — confirm
# if a split could lack a class.
print('Vectorizer:\n', model['vect'], '\n', sep='')
print('Classifier:\n', model['clf'], '\n', sep='')

print('\nTRAINSET')
print( classification_report( y_train, y_pred_train, target_names=labels, digits=4 ) )

print('DEVSET')
print( classification_report( y_dev, y_pred_dev, target_names=labels, digits=4 ) )

Vectorizer:
TfidfVectorizer(analyzer='char', binary=True, ngram_range=(1, 5),
                stop_words='english')

Classifier:
LogisticRegression(max_iter=500)


TRAINSET
              precision    recall  f1-score   support

     neutral     0.9813    0.9307    0.9553      3262
         joy     0.9845    0.9905    0.9875      3262
       trust     0.9791    0.9752    0.9771      3262
     disgust     0.9887    0.9960    0.9924      3262
    optimism     0.9705    0.9880    0.9792      3262
anticipation     0.9790    0.9865    0.9827      3262
     sadness     0.9945    0.9994    0.9969      3262
        fear     0.9939    0.9988    0.9963      3262
    surprise     0.9988    1.0000    0.9994      3262
       anger     0.9997    1.0000    0.9998      3262
   pessimism     0.9982    1.0000    0.9991      3262
        love     0.9969    1.0000    0.9985      3262

    accuracy                         0.9888     39144
   macro avg     0.9888    0.9888    0.9887     39144
weighted avg   

In [389]:
# create zip file for submission
file     = 'data/results.csv'
zip_file = 'data/results.zip'

# Use 'pred_emotion' when that column exists; otherwise fall back to the classifier
# predictions stored in 'clf_pred_emotion', so the cell does not fail with a KeyError
# when no earlier cell has created 'pred_emotion'.
pred_col = 'pred_emotion' if 'pred_emotion' in df_dev.columns else 'clf_pred_emotion'

# One predicted emotion name per line, in dev-set row order (no header, no trailing newline)
with open(file, 'w', encoding='utf-8') as f:
    f.write( '\n'.join( df_dev[pred_col].tolist() ) )

# NOTE(review): the archive member keeps the 'data/' prefix of `file` — confirm the
# submission system accepts a nested path (otherwise pass arcname= to zf.write).
with zipfile.ZipFile(zip_file, "w", compression=zipfile.ZIP_STORED) as zf:        # , compression=zipfile.ZIP_DEFLATED
    zf.write( file )

## APPENDIX

In [1805]:
# BEST THRESHOLD - DEV SET (IRRELEVANT)
# Sweep thresholds 0.01 .. 0.99 and, for each one, record the dev-set metrics as a row:
#   [macro F1, micro F1, macro precision, micro precision, macro recall, micro recall, threshold]
# NOTE(review): convert_preds, y_pred_dev_probas and y_dev_encoded are not defined in any
# cell visible in this notebook — presumably left over from a multi-label experiment;
# this cell will not run on a fresh kernel until they are restored.
res_dev = []
for i in range(1,100):
    threshold = i/100
    y_pred_dev_encoded = convert_preds(y_pred_dev_probas, threshold=threshold)
    clf_rep_dev = classification_report( y_dev_encoded, y_pred_dev_encoded, target_names=labels, output_dict=True )
    res_dev.append([ clf_rep_dev['macro avg']['f1-score'],  clf_rep_dev['micro avg']['f1-score'],
                     clf_rep_dev['macro avg']['precision'],  clf_rep_dev['micro avg']['precision'],
                     clf_rep_dev['macro avg']['recall'],  clf_rep_dev['micro avg']['recall'], threshold
                   ])

In [None]:
# Ten best rows of res_dev, ranked by their first element (macro F1), highest first.
sorted( res_dev, key=lambda x: x[0], reverse=True )[:10]

## All results below are on the dataset that was not upsampled

### Log of some best results

__XGBoost__ (micro, macro F1 score)

__analyzer='char'__:
* tfidf, binary=False, ngram(1,5), min_df=7, 'english': __0.7053, 0.6758__ (0.9792, 0.9866)    # more overfit   # F1 micro = 0.71, but F1 macro = 0.66 when min_df=8
* increasing the number of estimators to 150 improves macro F1 to 0.6773, but training F1 becomes (0.9973, 0.9983)
* count, binary=False, ngram(1,5), min_df=1, 'english': 0.6683, 0.6342 (0.9338, 0.9528)

__analyzer='word'__:
* tfidf, binary=False, ngram(1,1), min_df=1, 'english': 0.5189, 0.4503
* tfidf, binary=False, ngram(1,1), min_df=1, None: 0.5281, 0.4528
* tfidf - increasing word ngram_range has no effect
* tfidf, binary=False, ngram(1,1), min_df=5, 'english': 0.5323, 0.4680
* count - much worse: 0.4794, 0.3641

Changing HPs for XGBClassifier didn't have any effect

__SVC__ results w/out ANY HP fine-tuning (using the best features from XGB) - 0.6952, 0.6678 (training 0.9562, 0.9731!). Least overfit

__RandomForest__'s results w/out ANY HP fine-tuning were close (0.68, 0.65), but the training F1 was 0.9997 both. Overfit 

In [None]:
# best XGB
# Record of the best XGBClassifier configuration together with the vectorizer settings it was
# paired with.
# NOTE(review): despite the '_word' suffix, the vectorizer settings below use analyzer='char'.
# NOTE(review): both 'seed' (2) and 'random_state' (47) are set — confirm which one is honoured.
clf_params_xgb_word = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.3,                                 # eta
    'objective': 'multi:softmax',                         # multi:softmax, multi:softprob, rank:pairwise
    'eval_metric': 'merror',                              # multiclass - merror, mlogloss
    'base_score': 0.5,
    'booster': 'gbtree',                                  # gbtree, dart
    'tree_method': 'auto',                                # auto, exact, approx, hist and gpu_hist
    'importance_type': 'gain',                            # default 'gain'; also 'weight', 'cover', 'total_gain', 'total_cover'
    'gamma': 0,                                           # larger - more conservative, [0, inf]
    'reg_alpha': 0,                                       # L1 reg., larger - more conservative
    'reg_lambda': 1,                                      # L2 reg., larger - more conservative
    'sampling_method': 'uniform',                         # uniform, gradient_based
    'max_delta_step': 1,                                  # 1-10
    'min_child_weight': 1,
    'subsample': 1.0,                                     # 0-1
    'colsample_bylevel': 1.0,                             # 0-1
    'colsample_bynode': 1.0,                              # 0-1 (original note: "optimized for higher recall")
    'colsample_bytree': 1.0,                              # 0-1
    'seed': 2,
    'num_class': 12,
    #'use_label_encoder': False,
    'random_state': random_state, # 47
    'n_jobs': -1,
}

# Vectorizer settings used with the best XGB run: character n-grams (1-5), n-grams seen in
# fewer than 7 documents dropped, term counts not binarised.
# NOTE(review): rebinding vect_params here overwrites the dict of the same name defined for
# the baseline model earlier in the notebook.
# NOTE(review): 'stop_words' is documented by scikit-learn as applying only to
# analyzer='word', so 'english' presumably has no effect here — confirm.
vect_params = {
    'max_df': 1.0,
    'min_df': 7,
    'analyzer': 'char',
    'ngram_range': (1,5),
    'binary': False,
    'stop_words': 'english',
}

# NOTE(review): this builds a default TfidfVectorizer and does not pass vect_params —
# it appears to only indicate which vectorizer class was used.
TfidfVectorizer()

### Best XGBClassifier results
Note: no grid search or cross-validation - the results of this classifier were not intended for official submission.
```
Vectorizer:
TfidfVectorizer(analyzer='char', min_df=7, ngram_range=(1, 5),
                stop_words='english')

Classifier:
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1.0, colsample_bynode=1.0, colsample_bytree=1.0,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='merror', feature_types=None, gamma=0, gpu_id=None,
              grow_policy=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.3, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=1,
              max_depth=6, max_leaves=None, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=-1,
              num_class=12, num_parallel_tree=None, objective='multi:softmax', ...)


TRAINSET
              precision    recall  f1-score   support

     neutral     0.9471    0.9985    0.9721      3262
         joy     0.9921    0.9883    0.9902      1022
       trust     0.9963    0.9589    0.9772      1118
     disgust     1.0000    0.9403    0.9692       687
    optimism     0.9953    0.9727    0.9839       880
anticipation     1.0000    0.9387    0.9684       832
     sadness     1.0000    0.9897    0.9948       486
        fear     1.0000    0.9691    0.9843       453
    surprise     1.0000    1.0000    1.0000       199
       anger     1.0000    1.0000    1.0000       226
   pessimism     1.0000    1.0000    1.0000       178
        love     1.0000    1.0000    1.0000       187

    accuracy                         0.9792      9530
   macro avg     0.9942    0.9797    0.9867      9530
weighted avg     0.9802    0.9792    0.9792      9530

DEVSET
              precision    recall  f1-score   support

     neutral     0.5908    0.9227    0.7203       388
         joy     0.8450    0.8321    0.8385       131
       trust     0.7895    0.6000    0.6818       125
     disgust     0.8133    0.5398    0.6489       113
    optimism     0.8193    0.6182    0.7047       110
anticipation     0.6912    0.5000    0.5802        94
     sadness     0.8696    0.6452    0.7407        62
        fear     0.9000    0.5192    0.6585        52
    surprise     0.8500    0.4857    0.6182        35
       anger     1.0000    0.4857    0.6538        35
   pessimism     1.0000    0.3793    0.5500        29
        love     0.9091    0.5882    0.7143        17

    accuracy                         0.7053      1191
   macro avg     0.8398    0.5930    0.6758      1191
weighted avg     0.7519    0.7053    0.6992      1191
```

## With Upsampling
* Upsampled XGB - similar to or slightly worse than non-upsampled XGB
* Upsampled LogisticRegression - close to XGB