# Copy labels leaked from the training set; classify the rest with a baseline classifier

## The Association for Computational Linguistics
## WASSA 2023 Shared Task on Multi-Label and Multi-Class Emotion Classification on Code-Mixed Text Messages
See more details [here](https://codalab.lisn.upsaclay.fr/competitions/10864#learn_the_details)

In [1]:
import openai
import numpy as np
import pandas as pd
import sklearn
import re, os
import time
import zipfile
from typing import List
from copy import deepcopy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from tqdm.autonotebook import tqdm
import tiktoken
# enable tqdm's pandas integration
tqdm.pandas()

# raise the notebook display limits so wide/long dataframes are shown in full
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 400)
#os.path.join()

  from tqdm.autonotebook import tqdm


In [2]:
# new new version (Dec 2022)
def upsample_all( df_, labels_col='target', random_state=47 ):
    '''
        Upsample each class in column labels_col of pandas dataframe df_
        to the number of data points in the majority class.

        Parameters:
            df_          - pandas dataframe with one row per data point
            labels_col   - name of the column holding the class labels
            random_state - seed passed to every DataFrame.sample() call

        Returns:
            A new, shuffled dataframe with a fresh 0..n-1 index in which every
            class has as many rows as the largest class. Rows whose label is NaN
            match no class and are left out. If there are no labelled rows at
            all, an empty dataframe with the same columns is returned.
    '''
    # one sub-dataframe per class, in order of first appearance
    class_frames = [ df_[ df_[labels_col] == label ] for label in df_[labels_col].unique() ]
    # a NaN label never equals itself, so its frame is empty - nothing to upsample
    class_frames = [ frame for frame in class_frames if len(frame) > 0 ]
    if not class_frames:
        return df_.iloc[:0].copy().reset_index(drop=True)

    max_len = max( len(frame) for frame in class_frames )

    upsampled = []
    for frame in class_frames:
        n_missing = max_len - len(frame)                                         # how many rows to add
        if n_missing > 0:
            # sample without replacement while the class has enough distinct rows
            replace = len(frame) < n_missing
            extra = frame.sample( n_missing, replace=replace, random_state=random_state )
            frame = pd.concat( [frame, extra] )                                  # df len + (max_len - df len)
        upsampled.append( frame.sample( frac=1, random_state=random_state ) )    # shuffle within class

    # combine and reshuffle
    df_merged = pd.concat( upsampled )
    df_merged = df_merged.sample( frac=1, random_state=random_state ).reset_index(drop=True)

    return df_merged

In [3]:
# emotion names in (approximately) decreasing frequency; position == integer class id
_emotions_in_order = (
    'neutral', 'joy', 'trust', 'disgust', 'optimism', 'anticipation',
    'sadness', 'fear', 'surprise', 'anger', 'pessimism', 'love',
)
label2key = { emotion: idx for idx, emotion in enumerate(_emotions_in_order) }

# inverse lookup: integer class id -> emotion name
key2label = dict( zip(label2key.values(), label2key.keys()) )
key2label

{0: 'neutral',
 1: 'joy',
 2: 'trust',
 3: 'disgust',
 4: 'optimism',
 5: 'anticipation',
 6: 'sadness',
 7: 'fear',
 8: 'surprise',
 9: 'anger',
 10: 'pessimism',
 11: 'love'}

In [4]:
# single seed reused for upsampling, shuffling and the classifier
random_state = 47

# Load and Prepare Data

In [5]:
# paths of the train / dev / test splits
file1, file2, file3 = 'data/mcec_train.csv', 'data/mcec_dev.csv', 'data/mcec_test.csv'

# read the three splits in the same order as the paths above
df_train, df_dev, df_test = ( pd.read_csv(path) for path in (file1, file2, file3) )

print(df_train.shape, df_dev.shape, df_test.shape)

(9530, 2) (1191, 2) (1191, 1)


In [7]:
# class distribution of the raw training set, then a peek at the first rows
print(df_train['Emotion'].value_counts())
df_train.head()

neutral         3262
trust           1118
joy             1022
optimism         880
anticipation     832
disgust          687
sadness          486
fear             453
anger            226
surprise         199
love             187
pessimism        178
Name: Emotion, dtype: int64


Unnamed: 0,Text,Emotion
0,Yes.I am in fyp lab cabin.but fyp presentation...,neutral
1,Yar insan ka bcha bn chawliyn na mar :p,joy
2,Terai uncle nai kahna hai kai ham nai to bahr ...,disgust
3,Yr ajao I m cming in the club,neutral
4,Mje wese Nimra ahmad ka Qur'aan ki aayaat k ba...,joy


In [8]:
# attach the integer class id that corresponds to each emotion name
for _labelled in (df_train, df_dev):
    _labelled['target'] = _labelled['Emotion'].map( label2key )

In [9]:
# light text cleaning (should I use clean regex for better accuracy?)
# raw strings: '\s' inside a plain literal is an invalid escape sequence
pad_punct    = re.compile(r'([^a-zA-Z ]+)')
multi_spaces = re.compile(r'\s{2,}')
#clean        = re.compile('[^a-zA-Z0-9,.?!\'\s]+')

def clean_text(s):
    '''
        Lightly normalize one message.

        Newlines become spaces, every run of characters other than ASCII
        letters and spaces (digits, punctuation, emoticons, ...) is padded
        with a space on both sides, repeated whitespace is collapsed to a
        single space and the result is stripped.

        Parameters:
            s - text to clean (str)

        Returns:
            The cleaned string.
    '''
    s = s.replace('\n', ' ')
    s = pad_punct.sub(r' \1 ', s)
    #s = clean.sub(' ', s)
    s = multi_spaces.sub(' ', s)
    return s.strip()

# add a cleaned copy of the raw text to every split
for _split in (df_train, df_dev, df_test):
    _split['text_clean'] = _split['Text'].apply( clean_text )

In [10]:
# drop every training row whose cleaned text also occurs in the dev or test set
val_sets = [ *df_dev['text_clean'], *df_test['text_clean'] ]
print(len(val_sets), len(set(val_sets)))

print(df_train.shape)
_seen_in_validation = df_train['text_clean'].isin(val_sets)
df_train = df_train[ ~_seen_in_validation ]
print(df_train.shape)

2382 2206
(9530, 4)
(8151, 4)


In [11]:
# remove duplicates from train set
# rows count as duplicates only when both the cleaned text and the label match,
# so the same text with two different labels is kept twice
df_train = df_train.drop_duplicates(subset=['text_clean', 'Emotion'])
print(df_train.shape)

(6167, 4)


In [12]:
# is additional text cleaning necessary? I don't see why
from collections import Counter

# lower-cased, whitespace-separated tokens of the whole (cleaned) training set
_all_train_text = ' '.join( df_train['text_clean'].tolist() )
train_words = _all_train_text.lower().split()

# token frequencies; show the 350 most frequent ones
c = Counter()
c.update( train_words )
c.most_common(350)

[('.', 2127),
 ('k', 1244),
 ('to', 1231),
 ('ha', 1214),
 ('hai', 804),
 ('ho', 793),
 ('ka', 726),
 ('me', 640),
 ('?', 615),
 ('b', 604),
 ('kr', 568),
 ('ga', 559),
 ('ni', 553),
 ('ko', 543),
 ('ki', 532),
 ('tha', 528),
 (',', 518),
 ('...', 502),
 ('na', 497),
 ('hn', 473),
 ('hy', 464),
 ('wo', 461),
 ('ma', 453),
 ('nai', 450),
 ('..', 450),
 ('a', 446),
 ('se', 415),
 ('p', 409),
 ('yar', 401),
 ('or', 392),
 ('yr', 389),
 ('h', 388),
 ('i', 385),
 ('han', 385),
 ('tu', 371),
 ('e', 331),
 (':', 327),
 ('ne', 324),
 ('kia', 321),
 ('he', 287),
 ('hain', 284),
 ('main', 281),
 ('ab', 254),
 ('koi', 252),
 ('us', 251),
 ('nae', 250),
 ('ap', 250),
 ('sir', 250),
 ('sy', 248),
 ('tm', 237),
 ('is', 223),
 ('nahi', 223),
 ('hi', 222),
 ('raha', 220),
 ('kal', 218),
 ('rha', 214),
 ('ja', 202),
 ('ny', 200),
 ('aj', 199),
 ('g', 199),
 ('m', 198),
 ('phr', 195),
 (':-', 193),
 ('aur', 192),
 ('mai', 192),
 ('....', 187),
 ('gya', 184),
 ('d', 183),
 ('bht', 181),
 ('u', 173),
 ('p

In [14]:
# Roman Urdu stopwords, taken from:
# https://www.kaggle.com/code/owaisraza009/roman-urdu-sentiment-analysis/notebook
# NOTE: 'lye', 'waisay' and 'rha' each appear twice, so len(stopwords1) counts duplicates
stopwords1 = [ 'ai', 'ayi', 'hy', 'hai', 'main', 'ki', 'tha', 'koi', 'ko', 'sy', 'woh', 'bhi', 'aur', 'wo', 'yeh',
               'rha', 'hota', 'ho', 'ga', 'ka', 'le', 'lye', 'kr', 'kar', 'lye', 'liye', 'hotay', 'waisay', 'gya',
               'gaya', 'kch', 'ab', 'thy', 'thay', 'houn', 'hain', 'han', 'to', 'is', 'hi', 'jo', 'kya', 'thi', 'se',
               'pe', 'phr', 'wala', 'waisay', 'us', 'na', 'ny', 'hun', 'rha', 'raha', 'ja', 'rahay', 'abi', 'uski',
               'ne', 'haan', 'acha', 'nai', 'sent', 'photo', 'you', 'kafi', 'gai', 'rhy', 'kuch', 'jata', 'aye', 'ya',
               'dono', 'hoa', 'aese', 'de', 'wohi', 'jati', 'jb', 'krta', 'lg', 'rahi', 'hui', 'karna', 'krna', 'gi',
               'hova', 'yehi', 'jana', 'jye', 'chal', 'mil', 'tu', 'hum', 'par', 'hay', 'kis', 'sb', 'gy', 'dain',
               'krny', 'tou', ]

# https://github.com/haseebelahi/roman-urdu-stopwords.git
file = 'data/stopwords.txt'
# context manager closes the file handle; explicit encoding avoids the platform default
with open(file, encoding='utf-8') as _stopwords_fh:
    stopwords2 = _stopwords_fh.read().split()
# sanity check: the file should hold exactly the hard-coded list above
print(stopwords2 == stopwords1)

# NOTE(review): _stop_words is a private scikit-learn module; the same set appears to be
# exposed publicly as sklearn.feature_extraction.text.ENGLISH_STOP_WORDS - confirm and prefer it
from sklearn.feature_extraction import _stop_words
stopwords_en  = _stop_words.ENGLISH_STOP_WORDS
# selected from stopwords_en
stopwords_en2 = [ 'a', 'about', 'also', 'am', 'an', 'and', 'are', 'as', 'at', 'be', 
                  'been', 'being', 'by', 'co', 'con', 'de', 'eg', 'eight', 'eleven', 'else', 'etc', 
                  'fifteen', 'fifty', 'five', 'for', 'forty', 'four', 'from', 'had',
                  'has', 'hasnt', 'have', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 
                  'his', 'how', 'i', 'ie', 'if', 'in', 'inc', 'into', 'is', 'it', 'its', 'itself',
                  'ltd', 'me', 'mine', 'my', 'myself', 'nine', 'no', 'now', 'of', 'off', 'on',
                  'once', 'one', 'onto', 'or', 'other', 'others', 'our', 'ours', 'ourselves',
                  'out', 'part', 'per', 're', 'several', 'she', 'side', 'since', 'six', 'sixty',
                  'so', 'ten', 'than', 'that', 'the', 'their', 'them',
                  'themselves', 'then', 'there', 'these', 'they', 'thick', 'thin', 'third', 'this', 'those', 
                  'three', 'to', 'twelve', 'twenty', 'two', 'un','us', 'very',
                  'via', 'was', 'we', 'were', 'what', 'when', 'where', 'whether', 'which', 'while', 
                  'who', 'whom', 'whose', 'why', 'with', 'within', 'would', 'yet', 'you', 'your', 'yours',
                   'yourself', 'yourselves', ]

# sizes of the Roman Urdu list, sklearn's English list and the hand-picked English subset
print( len(stopwords1), len(stopwords_en), len(stopwords_en2), )

True
102 318 129


In [15]:
# class balance before ...
_counts_before = df_train['Emotion'].value_counts()
print('\nTrain set b4 upsampling:\n', _counts_before, sep='')

# ... and after bringing every class up to the size of the largest one
df_train = upsample_all( df_train.copy(), labels_col='target', random_state=random_state )
_emotion_counts = df_train['Emotion'].value_counts()
_target_counts  = df_train['target'].value_counts()
print('\nTrain set after upsampling:\n', _emotion_counts, '\n\n', _target_counts, sep='')


Train set b4 upsampling:
neutral         2097
joy              722
trust            684
optimism         577
anticipation     562
disgust          441
sadness          316
fear             257
anger            142
surprise         136
love             124
pessimism        109
Name: Emotion, dtype: int64

Train set after upsampling:
fear            2097
pessimism       2097
anticipation    2097
optimism        2097
sadness         2097
anger           2097
love            2097
trust           2097
surprise        2097
neutral         2097
joy             2097
disgust         2097
Name: Emotion, dtype: int64

7     2097
10    2097
5     2097
4     2097
6     2097
9     2097
11    2097
2     2097
8     2097
0     2097
1     2097
3     2097
Name: target, dtype: int64


In [16]:
# raw (uncleaned) text as features, integer class ids as targets
X_train, y_train = df_train['Text'].values, df_train['target'].values
X_dev,   y_dev   = df_dev['Text'].values,   df_dev['target'].values

# shuffle features and targets together
X_train, y_train = sklearn.utils.shuffle( X_train, y_train, random_state=random_state )
print( 'Shape of datasets: ', X_train.shape, y_train.shape, X_dev.shape, y_dev.shape )

Shape of datasets:  (25164,) (25164,) (1191,) (1191,)


# Baseline Model

In [147]:
# keyword arguments for the baseline XGBClassifier
# NOTE(review): both 'seed' (2) and 'random_state' (47) are set - confirm which one
# the installed xgboost version actually uses
clf_params_xgb = {
    'n_estimators': 100,
    'max_depth': 6,                 # maybe 4 w/gamma=0.55 (train accu=0.9 vs. 0.97 for depth 6)
    'learning_rate': 0.3,                                 # eta  
    'objective': 'multi:softmax',                         # multi:softmax, multi:softprob, rank:pairwise
    'eval_metric': 'merror',                              # multiclass - merror, mlogloss
    'base_score': 0.5,
    'booster': 'gbtree',                                  # gbtree, dart
    'tree_method': 'auto',                                # auto, exact, approx, hist and gpu_hist
    'importance_type': 'gain',                            # default "gain", "weight", "cover", "total_gain", "total_cover"
    'gamma': 0,                                           # larger - more conservative, [0, inf]
    'reg_alpha': 1.5,                                     # L1 reg., larger - more conservative
    'reg_lambda': 1,                                      # L2 reg., larger - more conservative
    'sampling_method': 'uniform',                         # uniform, gradient_based
    'max_delta_step': 1,                                  # 1-10
    'min_child_weight': 1,
    'subsample': 1.0,                                     # 0-1    
    'colsample_bylevel': 1.0,                             # 0-1
    'colsample_bynode': 1.0,                              # optimized for higher recall
    'colsample_bytree': 1.0,                              # 0-1  
    'seed': 2,
    'num_class': 12,                                      # one class per entry of label2key
    #'use_label_encoder': False,
    'random_state': random_state,
    'n_jobs': -1,    
}

In [148]:
# keyword arguments for the character n-gram vectorizer
# 'stop_words' is intentionally not set: scikit-learn applies it only when
# analyzer == 'word', so with analyzer='char' it had no effect besides a warning
vect_params = {
    'max_df': 1.0,
    'min_df': 8,            # best 8
    'analyzer': 'char',
    'ngram_range': (1,4),   # best (1,4)
    'binary': True,
}

In [149]:
# bag of character n-grams (the TF-IDF variant is kept for reference)
#vectorizer = TfidfVectorizer( **vect_params )
vectorizer = CountVectorizer( **vect_params )

clf = XGBClassifier( **clf_params_xgb )


# vectorize, then classify; fitted on the upsampled training set
model = Pipeline( steps=[('vect', vectorizer), ('clf', clf)] )
model.fit(X_train, y_train)



In [150]:
# predict on both splits with the fitted pipeline
y_pred_train, y_pred_dev = ( model.predict(split) for split in (X_train, X_dev) )

# store the dev predictions (class id and emotion name) in the dataframe
df_dev['clf_pred'] = y_pred_dev
df_dev['clf_pred_emotion'] = df_dev['clf_pred'].map( key2label )

# class names for the classification report, in label2key order
labels = [ *label2key ]
print('Labels:', labels)

Labels: ['neutral', 'joy', 'trust', 'disgust', 'optimism', 'anticipation', 'sadness', 'fear', 'surprise', 'anger', 'pessimism', 'love']


In [151]:
# show the fitted pipeline steps
for _title, _step in (('Vectorizer:\n', 'vect'), ('Classifier:\n', 'clf')):
    print(_title, model[_step], '\n', sep='')

# per-class metrics on the training and dev sets
for _header, _y_true, _y_hat in (('\nTRAINSET', y_train, y_pred_train),
                                 ('DEVSET', y_dev, y_pred_dev)):
    print(_header)
    print( classification_report( _y_true, _y_hat, target_names=labels, digits=4 ) )

Vectorizer:
CountVectorizer(analyzer='char', binary=True, min_df=8, ngram_range=(1, 4),
                stop_words='english')

Classifier:
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1.0, colsample_bynode=1.0, colsample_bytree=1.0,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='merror', feature_types=None, gamma=0, gpu_id=None,
              grow_policy=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.3, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=1,
              max_depth=6, max_leaves=None, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=-1,
              num_class=12, num_parallel_tree=None, objective='multi:softmax', ...)


TRAINSET
              precision    recall  f1-score   support

     neutral     0.8907    0.8546    0.8722      2097
      

In [160]:
# LOAD THE FILE TO WHICH THE LABELS LEAKED FROM THE TRAINING SET HAVE ALREADY BEEN TRANSFERRED
# NOTE: pickle files can execute code on load - only read files produced by this project
file = 'data/df_test_with_leakedData_only.pkl'
df_test = pd.read_pickle(file)
# rows still missing an 'Emotion' are the ones the classifier has to label
print(df_test.isna().sum())
df_test.head()

text               0
gpt_embedding      0
text_clean         0
Emotion          607
dtype: int64


Unnamed: 0,text,gpt_embedding,text_clean,Emotion
0,Razia bta rahe the but wo sure nahe the,"[0.00961806159466505, -0.009336333721876144, 0...",Razia bta rahe the but wo sure nahe the,
1,Me phr kuch parh hi lun :-P,"[0.004914113786071539, -0.011376334354281425, ...",Me phr kuch parh hi lun :- P,
2,Hoxtl life ma hm bht jald matur hO jaty hai,"[0.006094285752624273, -0.002856901613995433, ...",Hoxtl life ma hm bht jald matur hO jaty hai,fear
3,Yar A4 me seminar ha a ja..,"[-0.018210574984550476, -0.021075459197163582,...",Yar A 4 me seminar ha a ja ..,neutral
4,K.Quid e azam k 400 bnay hain,"[-0.0046786158345639706, -0.009765759110450745...",K . Quid e azam k 400 bnay hain,neutral


In [155]:
# classify every test message, then translate class ids to emotion names
preds = model.predict( df_test['text'].values )
df_test['pred'] = pd.Series( preds, index=df_test.index ).map( key2label )
df_test.head(25)

Unnamed: 0,text,gpt_embedding,text_clean,Emotion,pred
0,Razia bta rahe the but wo sure nahe the,"[0.00961806159466505, -0.009336333721876144, 0...",Razia bta rahe the but wo sure nahe the,,disgust
1,Me phr kuch parh hi lun :-P,"[0.004914113786071539, -0.011376334354281425, ...",Me phr kuch parh hi lun :- P,,joy
2,Hoxtl life ma hm bht jald matur hO jaty hai,"[0.006094285752624273, -0.002856901613995433, ...",Hoxtl life ma hm bht jald matur hO jaty hai,fear,disgust
3,Yar A4 me seminar ha a ja..,"[-0.018210574984550476, -0.021075459197163582,...",Yar A 4 me seminar ha a ja ..,neutral,neutral
4,K.Quid e azam k 400 bnay hain,"[-0.0046786158345639706, -0.009765759110450745...",K . Quid e azam k 400 bnay hain,neutral,neutral
5,Hm sb b a jate nd aur b bare bnde b a jate! :-D,"[-0.009173348546028137, -0.0036938106641173363...",Hm sb b a jate nd aur b bare bnde b a jate ! :- D,,joy
6,Yr apnay senior ka method ya hota ha k wo all ...,"[0.012461582198739052, 0.004763346165418625, 0...",Yr apnay senior ka method ya hota ha k wo all ...,neutral,trust
7,thek ha sir i will be at your office at 1,"[-0.005030790343880653, -0.015968872234225273,...",thek ha sir i will be at your office at 1,,optimism
8,Hahahaha tuj ma agr itni wafa ha to tu he sab ...,"[-0.003401822643354535, -0.008568023331463337,...",Hahahaha tuj ma agr itni wafa ha to tu he sab ...,,joy
9,Well you didnt told me before that the meeting...,"[-0.005642556585371494, -0.004453025758266449,...",Well you didnt told me before that the meeting...,,optimism


In [156]:
# transfer only the unknown labels 
def transfer_label(row):
    '''
        Fill a missing 'Emotion' with the classifier prediction in 'pred'.

        Parameters:
            row - one dataframe row (pandas Series) with 'Emotion' and 'pred' entries

        Returns:
            A copy of the row; 'Emotion' is replaced by 'pred' only when it is
            missing (NaN/None). Known labels are returned unchanged.
    '''
    # string labels are known; anything else is checked for NaN/None
    if isinstance(row['Emotion'], str) or not pd.isnull(row['Emotion']):
        return row

    # work on a copy: DataFrame.apply does not support functions that mutate their input
    row = row.copy()
    row['Emotion'] = row['pred']
    return row


# fill the missing labels row by row, then check that no NaNs are left
df_test = df_test.apply( transfer_label, axis=1 )
print(df_test.isna().sum())
# final label distribution (leaked labels + predictions)
df_test['Emotion'].value_counts()

text             0
gpt_embedding    0
text_clean       0
Emotion          0
pred             0
dtype: int64


neutral         401
trust           127
optimism        123
joy             121
anticipation    112
disgust          98
sadness          72
fear             57
pessimism        22
anger            20
surprise         20
love             18
Name: Emotion, dtype: int64

In [159]:
# save for official submission
# only the 'Emotion' column is written, without the index
file = 'data/predictions_MCEC.csv'
df_test['Emotion'].to_csv( file, index=False, encoding='utf-8' )

# Hyperparameter (HP) tuning

In [144]:
# search for one optimal hyperparameter (here: XGBoost 'subsample', 1.0 down to 0.4)
labels = list(label2key.keys())
res = []                                  # one row per tried value, see res.append() below
params = list(range(40, 101, 5))          # percentages; divided by 100 when used

# the vectorizer settings do not depend on the searched parameter - build them once,
# from the baseline settings so the two cannot drift apart
vect_params2 = dict( vect_params )

for param in reversed(params):

    # identical to the baseline classifier settings except for 'subsample'
    clf_params_xgb2 = { **clf_params_xgb, 'subsample': param/100 }

    vectorizer = CountVectorizer( **vect_params2 )
    clf        = XGBClassifier( **clf_params_xgb2 )
    # NOTE: this rebinds the global `model`; after the loop it holds the last fit
    model      = Pipeline( steps=[('vect', vectorizer), ('clf', clf)] )
    model.fit(X_train, y_train)

    # make predictions
    y_pred_train = model.predict(X_train)
    y_pred_dev   = model.predict(X_dev)

    # print classification reports
    print('Parameter:', param/100, '\n')
    print('\nTRAINSET')
    print( classification_report( y_train, y_pred_train, target_names=labels, digits=4 ) )
    clf_rep1 = classification_report( y_train, y_pred_train, target_names=labels, output_dict=True )

    print('DEVSET')
    print( classification_report( y_dev, y_pred_dev, target_names=labels, digits=4 ) )
    clf_rep2 = classification_report( y_dev, y_pred_dev, target_names=labels, output_dict=True )

    # row layout: train macro-F1, train accuracy, dev macro-F1, dev accuracy, param (as percentage)
    res.append([ clf_rep1['macro avg']['f1-score'], clf_rep1['accuracy'],
                 clf_rep2['macro avg']['f1-score'], clf_rep2['accuracy'], param, ])

    print('\n', '='*78, '\n')



Parameter: 1.0 


TRAINSET
              precision    recall  f1-score   support

     neutral     0.8907    0.8546    0.8722      2097
         joy     0.9744    0.9800    0.9772      2097
       trust     0.9641    0.9361    0.9499      2097
     disgust     0.9741    0.9695    0.9718      2097
    optimism     0.9495    0.9676    0.9584      2097
anticipation     0.9262    0.9461    0.9361      2097
     sadness     0.9867    0.9905    0.9886      2097
        fear     0.9915    0.9990    0.9952      2097
    surprise     0.9957    1.0000    0.9979      2097
       anger     0.9976    1.0000    0.9988      2097
   pessimism     0.9938    1.0000    0.9969      2097
        love     0.9962    1.0000    0.9981      2097

    accuracy                         0.9703     25164
   macro avg     0.9700    0.9703    0.9701     25164
weighted avg     0.9700    0.9703    0.9701     25164

DEVSET
              precision    recall  f1-score   support

     neutral     0.5255    0.5851    0.5537 



Parameter: 0.95 


TRAINSET
              precision    recall  f1-score   support

     neutral     0.8918    0.8450    0.8678      2097
         joy     0.9720    0.9762    0.9741      2097
       trust     0.9599    0.9356    0.9476      2097
     disgust     0.9723    0.9714    0.9719      2097
    optimism     0.9367    0.9671    0.9517      2097
anticipation     0.9222    0.9385    0.9303      2097
     sadness     0.9890    0.9881    0.9885      2097
        fear     0.9910    0.9990    0.9950      2097
    surprise     0.9943    1.0000    0.9971      2097
       anger     0.9976    1.0000    0.9988      2097
   pessimism     0.9938    1.0000    0.9969      2097
        love     0.9971    1.0000    0.9986      2097

    accuracy                         0.9684     25164
   macro avg     0.9682    0.9684    0.9682     25164
weighted avg     0.9682    0.9684    0.9682     25164

DEVSET
              precision    recall  f1-score   support

     neutral     0.4906    0.5387    0.5135



Parameter: 0.9 


TRAINSET
              precision    recall  f1-score   support

     neutral     0.8757    0.8431    0.8591      2097
         joy     0.9707    0.9785    0.9746      2097
       trust     0.9611    0.9318    0.9462      2097
     disgust     0.9702    0.9628    0.9665      2097
    optimism     0.9473    0.9680    0.9575      2097
anticipation     0.9196    0.9385    0.9290      2097
     sadness     0.9862    0.9914    0.9888      2097
        fear     0.9957    0.9957    0.9957      2097
    surprise     0.9943    1.0000    0.9971      2097
       anger     0.9967    1.0000    0.9983      2097
   pessimism     0.9934    1.0000    0.9967      2097
        love     0.9962    1.0000    0.9981      2097

    accuracy                         0.9675     25164
   macro avg     0.9673    0.9675    0.9673     25164
weighted avg     0.9673    0.9675    0.9673     25164

DEVSET
              precision    recall  f1-score   support

     neutral     0.5123    0.5361    0.5239 



Parameter: 0.85 


TRAINSET
              precision    recall  f1-score   support

     neutral     0.8876    0.8512    0.8690      2097
         joy     0.9710    0.9747    0.9729      2097
       trust     0.9479    0.9361    0.9419      2097
     disgust     0.9713    0.9690    0.9702      2097
    optimism     0.9486    0.9595    0.9540      2097
anticipation     0.9270    0.9390    0.9330      2097
     sadness     0.9830    0.9914    0.9872      2097
        fear     0.9948    0.9943    0.9945      2097
    surprise     0.9953    1.0000    0.9976      2097
       anger     0.9971    1.0000    0.9986      2097
   pessimism     0.9943    1.0000    0.9971      2097
        love     0.9938    1.0000    0.9969      2097

    accuracy                         0.9679     25164
   macro avg     0.9676    0.9679    0.9677     25164
weighted avg     0.9676    0.9679    0.9677     25164

DEVSET
              precision    recall  f1-score   support

     neutral     0.4930    0.5464    0.5183



Parameter: 0.8 


TRAINSET
              precision    recall  f1-score   support

     neutral     0.8772    0.8450    0.8608      2097
         joy     0.9697    0.9766    0.9732      2097
       trust     0.9610    0.9399    0.9503      2097
     disgust     0.9717    0.9661    0.9689      2097
    optimism     0.9425    0.9619    0.9521      2097
anticipation     0.9213    0.9380    0.9296      2097
     sadness     0.9871    0.9871    0.9871      2097
        fear     0.9948    0.9948    0.9948      2097
    surprise     0.9962    1.0000    0.9981      2097
       anger     0.9967    1.0000    0.9983      2097
   pessimism     0.9929    1.0000    0.9964      2097
        love     0.9957    1.0000    0.9979      2097

    accuracy                         0.9675     25164
   macro avg     0.9672    0.9675    0.9673     25164
weighted avg     0.9672    0.9675    0.9673     25164

DEVSET
              precision    recall  f1-score   support

     neutral     0.5117    0.5619    0.5356 



Parameter: 0.75 


TRAINSET
              precision    recall  f1-score   support

     neutral     0.8840    0.8398    0.8613      2097
         joy     0.9785    0.9771    0.9778      2097
       trust     0.9568    0.9394    0.9480      2097
     disgust     0.9699    0.9666    0.9682      2097
    optimism     0.9429    0.9690    0.9558      2097
anticipation     0.9269    0.9428    0.9348      2097
     sadness     0.9886    0.9924    0.9905      2097
        fear     0.9929    0.9962    0.9945      2097
    surprise     0.9934    1.0000    0.9967      2097
       anger     0.9967    1.0000    0.9983      2097
   pessimism     0.9901    1.0000    0.9950      2097
        love     0.9990    1.0000    0.9995      2097

    accuracy                         0.9686     25164
   macro avg     0.9683    0.9686    0.9684     25164
weighted avg     0.9683    0.9686    0.9684     25164

DEVSET
              precision    recall  f1-score   support

     neutral     0.4953    0.5490    0.5208



Parameter: 0.7 


TRAINSET
              precision    recall  f1-score   support

     neutral     0.8850    0.8441    0.8640      2097
         joy     0.9715    0.9747    0.9731      2097
       trust     0.9565    0.9428    0.9496      2097
     disgust     0.9698    0.9642    0.9670      2097
    optimism     0.9442    0.9762    0.9599      2097
anticipation     0.9210    0.9285    0.9247      2097
     sadness     0.9862    0.9862    0.9862      2097
        fear     0.9919    0.9962    0.9941      2097
    surprise     0.9962    1.0000    0.9981      2097
       anger     0.9971    1.0000    0.9986      2097
   pessimism     0.9934    1.0000    0.9967      2097
        love     0.9967    1.0000    0.9983      2097

    accuracy                         0.9677     25164
   macro avg     0.9675    0.9677    0.9675     25164
weighted avg     0.9675    0.9677    0.9675     25164

DEVSET
              precision    recall  f1-score   support

     neutral     0.5012    0.5464    0.5228 



Parameter: 0.65 


TRAINSET
              precision    recall  f1-score   support

     neutral     0.8852    0.8455    0.8649      2097
         joy     0.9707    0.9809    0.9758      2097
       trust     0.9643    0.9390    0.9514      2097
     disgust     0.9715    0.9747    0.9731      2097
    optimism     0.9439    0.9700    0.9567      2097
anticipation     0.9269    0.9309    0.9289      2097
     sadness     0.9876    0.9909    0.9893      2097
        fear     0.9933    0.9957    0.9945      2097
    surprise     0.9934    1.0000    0.9967      2097
       anger     0.9976    1.0000    0.9988      2097
   pessimism     0.9915    1.0000    0.9957      2097
        love     0.9981    1.0000    0.9990      2097

    accuracy                         0.9690     25164
   macro avg     0.9687    0.9690    0.9687     25164
weighted avg     0.9687    0.9690    0.9687     25164

DEVSET
              precision    recall  f1-score   support

     neutral     0.5157    0.5515    0.5330



Parameter: 0.6 


TRAINSET
              precision    recall  f1-score   support

     neutral     0.8705    0.8431    0.8566      2097
         joy     0.9701    0.9742    0.9722      2097
       trust     0.9602    0.9437    0.9519      2097
     disgust     0.9685    0.9661    0.9673      2097
    optimism     0.9450    0.9590    0.9520      2097
anticipation     0.9239    0.9380    0.9309      2097
     sadness     0.9857    0.9866    0.9862      2097
        fear     0.9943    0.9919    0.9931      2097
    surprise     0.9938    1.0000    0.9969      2097
       anger     0.9971    1.0000    0.9986      2097
   pessimism     0.9953    1.0000    0.9976      2097
        love     0.9957    1.0000    0.9979      2097

    accuracy                         0.9669     25164
   macro avg     0.9667    0.9669    0.9668     25164
weighted avg     0.9667    0.9669    0.9668     25164

DEVSET
              precision    recall  f1-score   support

     neutral     0.4929    0.5361    0.5136 



Parameter: 0.55 


TRAINSET
              precision    recall  f1-score   support

     neutral     0.8665    0.8355    0.8507      2097
         joy     0.9681    0.9695    0.9688      2097
       trust     0.9631    0.9323    0.9474      2097
     disgust     0.9682    0.9590    0.9636      2097
    optimism     0.9350    0.9599    0.9473      2097
anticipation     0.9162    0.9380    0.9270      2097
     sadness     0.9866    0.9862    0.9864      2097
        fear     0.9929    0.9981    0.9955      2097
    surprise     0.9938    1.0000    0.9969      2097
       anger     0.9967    1.0000    0.9983      2097
   pessimism     0.9920    1.0000    0.9960      2097
        love     0.9971    1.0000    0.9986      2097

    accuracy                         0.9649     25164
   macro avg     0.9647    0.9649    0.9647     25164
weighted avg     0.9647    0.9649    0.9647     25164

DEVSET
              precision    recall  f1-score   support

     neutral     0.4965    0.5490    0.5214



Parameter: 0.5 


TRAINSET
              precision    recall  f1-score   support

     neutral     0.8663    0.8278    0.8466      2097
         joy     0.9720    0.9766    0.9743      2097
       trust     0.9575    0.9351    0.9462      2097
     disgust     0.9644    0.9571    0.9607      2097
    optimism     0.9432    0.9657    0.9543      2097
anticipation     0.9197    0.9399    0.9297      2097
     sadness     0.9871    0.9881    0.9876      2097
        fear     0.9933    0.9962    0.9948      2097
    surprise     0.9948    1.0000    0.9974      2097
       anger     0.9962    1.0000    0.9981      2097
   pessimism     0.9901    1.0000    0.9950      2097
        love     0.9981    1.0000    0.9990      2097

    accuracy                         0.9655     25164
   macro avg     0.9652    0.9655    0.9653     25164
weighted avg     0.9652    0.9655    0.9653     25164

DEVSET
              precision    recall  f1-score   support

     neutral     0.4904    0.5284    0.5087 



Parameter: 0.45 


TRAINSET
              precision    recall  f1-score   support

     neutral     0.8802    0.8412    0.8603      2097
         joy     0.9684    0.9781    0.9732      2097
       trust     0.9527    0.9404    0.9465      2097
     disgust     0.9631    0.9590    0.9611      2097
    optimism     0.9461    0.9619    0.9539      2097
anticipation     0.9251    0.9418    0.9334      2097
     sadness     0.9914    0.9895    0.9905      2097
        fear     0.9924    0.9948    0.9936      2097
    surprise     0.9943    1.0000    0.9971      2097
       anger     0.9953    1.0000    0.9976      2097
   pessimism     0.9957    1.0000    0.9979      2097
        love     0.9986    1.0000    0.9993      2097

    accuracy                         0.9672     25164
   macro avg     0.9669    0.9672    0.9670     25164
weighted avg     0.9669    0.9672    0.9670     25164

DEVSET
              precision    recall  f1-score   support

     neutral     0.4807    0.5464    0.5115



Parameter: 0.4 


TRAINSET
              precision    recall  f1-score   support

     neutral     0.8681    0.8379    0.8527      2097
         joy     0.9700    0.9719    0.9709      2097
       trust     0.9598    0.9342    0.9468      2097
     disgust     0.9664    0.9595    0.9629      2097
    optimism     0.9336    0.9661    0.9496      2097
anticipation     0.9206    0.9285    0.9245      2097
     sadness     0.9872    0.9900    0.9886      2097
        fear     0.9886    0.9924    0.9905      2097
    surprise     0.9953    1.0000    0.9976      2097
       anger     0.9981    1.0000    0.9990      2097
   pessimism     0.9938    1.0000    0.9969      2097
        love     0.9962    1.0000    0.9981      2097

    accuracy                         0.9650     25164
   macro avg     0.9648    0.9650    0.9649     25164
weighted avg     0.9648    0.9650    0.9649     25164

DEVSET
              precision    recall  f1-score   support

     neutral     0.4942    0.5515    0.5213 

In [146]:
# Review results: rank the collected runs best-first by the score at index 2
# (presumably the dev-set macro-F1 -- confirm against the loop that fills `res`).
res = sorted(res, key=lambda run: run[2], reverse=True)
for run in res:
    print(run)

[0.9700934669687205, 0.970274996026069, 0.3503155943511486, 0.4508816120906801, 100]
[0.9683748207692101, 0.9686059450007948, 0.3343079157672156, 0.4357682619647355, 75]
[0.967742438920736, 0.96793037672866, 0.3320877010346391, 0.4324097397145256, 85]
[0.9672918461644503, 0.9674535050071531, 0.33163539380150825, 0.44080604534005036, 80]
[0.9673085085663319, 0.9674932443172787, 0.331150312582271, 0.42737195633921077, 90]
[0.968739745640386, 0.968963598791925, 0.33091337717551106, 0.4332493702770781, 65]
[0.9648524688070638, 0.9650294070894929, 0.32880852700348967, 0.4332493702770781, 40]
[0.968186148724008, 0.9684072484501669, 0.32873200127580676, 0.42737195633921077, 95]
[0.9670162290710995, 0.9672150691463997, 0.3224758890213447, 0.42989084802686817, 45]
[0.967520145105364, 0.9677316801780321, 0.32100839322477714, 0.4324097397145256, 70]
[0.9646998331946658, 0.9648704498489906, 0.31980562516758676, 0.4282115869017632, 55]
[0.9653166395000299, 0.9655460181211254, 0.3100596791754279, 0.

## Log of some best results

__Observations__:
* TfidfVectorizer overfits more, runs longer, and gives a lower macro-F1 than CountVectorizer
* CountVectorizer (binary=True) is the least overfit (0.97 on the train set)

In [None]:
# BEST HYPERPARAMETERS
# XGBoost settings recorded for the best run logged in this notebook.
# NOTE(review): both 'seed' and 'random_state' are set below; which of the two
# takes effect depends on the XGBoost version -- confirm and keep only one.
clf_params_xgb = {
    'n_estimators': 100,
    # Depth 4 combined with gamma=0.55 is a candidate alternative
    # (train accuracy 0.9 vs. 0.97 at depth 6).
    'max_depth': 6,
    'learning_rate': 0.3,            # a.k.a. eta
    'objective': 'multi:softmax',    # alternatives: multi:softprob, rank:pairwise
    'eval_metric': 'merror',         # multiclass metrics: merror, mlogloss
    'base_score': 0.5,
    'booster': 'gbtree',             # alternatives: dart
    'tree_method': 'auto',           # one of: auto, exact, approx, hist, gpu_hist
    'importance_type': 'gain',       # default "gain"; also "weight", "cover", "total_gain", "total_cover"
    'gamma': 0,                      # range [0, inf); larger is more conservative
    'reg_alpha': 1.5,                # L1 regularization; larger is more conservative
    'reg_lambda': 1,                 # L2 regularization; larger is more conservative
    'sampling_method': 'uniform',    # one of: uniform, gradient_based
    'max_delta_step': 1,             # typical range 1-10
    'min_child_weight': 1,
    'subsample': 1.0,                # range 0-1
    'colsample_bylevel': 1.0,        # range 0-1
    'colsample_bynode': 1.0,         # author's note: optimized for higher recall
    'colsample_bytree': 1.0,         # range 0-1
    'seed': 2,
    'num_class': 12,                 # the reports in this notebook list 12 labels
    # 'use_label_encoder': False,    # left disabled, as in the original cell
    'random_state': random_state,    # `random_state` is defined outside this cell
    'n_jobs': -1,
}

# Vectorizer settings recorded for the best run: binary presence features
# over character n-grams of length 1 to 4.
vect_params = {
    'max_df': 1.0,
    'min_df': 8,            # best 8
    'analyzer': 'char',
    'ngram_range': (1, 4),  # best (1,4)
    'binary': True,
    # scikit-learn applies stop words only when analyzer == 'word'; with the
    # 'char' analyzer the former 'english' value was ignored and only raised
    # an "unused parameter" warning. None keeps the extracted features
    # identical while making the configuration honest.
    'stop_words': None,
}

__Best results with the best hyperparameters__:
```
TRAINSET
              precision    recall  f1-score   support

     neutral     0.8907    0.8546    0.8722      2097
         joy     0.9744    0.9800    0.9772      2097
       trust     0.9641    0.9361    0.9499      2097
     disgust     0.9741    0.9695    0.9718      2097
    optimism     0.9495    0.9676    0.9584      2097
anticipation     0.9262    0.9461    0.9361      2097
     sadness     0.9867    0.9905    0.9886      2097
        fear     0.9915    0.9990    0.9952      2097
    surprise     0.9957    1.0000    0.9979      2097
       anger     0.9976    1.0000    0.9988      2097
   pessimism     0.9938    1.0000    0.9969      2097
        love     0.9962    1.0000    0.9981      2097

    accuracy                         0.9703     25164
   macro avg     0.9700    0.9703    0.9701     25164
weighted avg     0.9700    0.9703    0.9701     25164

DEVSET
              precision    recall  f1-score   support

     neutral     0.5255    0.5851    0.5537       388
         joy     0.7273    0.7328    0.7300       131
       trust     0.3898    0.3680    0.3786       125
     disgust     0.2778    0.2655    0.2715       113
    optimism     0.4393    0.4273    0.4332       110
anticipation     0.3223    0.4149    0.3628        94
     sadness     0.3492    0.3548    0.3520        62
        fear     0.2281    0.2500    0.2385        52
    surprise     0.3125    0.1429    0.1961        35
       anger     0.2857    0.1143    0.1633        35
   pessimism     0.2143    0.1034    0.1395        29
        love     0.5556    0.2941    0.3846        17

    accuracy                         0.4509      1191
   macro avg     0.3856    0.3378    0.3503      1191
weighted avg     0.4433    0.4509    0.4430      1191
```

__Another good result__  
analyzer='char', CountVectorizer, binary=True, ngram_range=(1,4), learning_rate=0.3
```
TRAINSET
              precision    recall  f1-score   support
                            
    accuracy                         0.9768     25164
   macro avg     0.9767    0.9768    0.9767     25164
weighted avg     0.9767    0.9768    0.9767     25164

DEVSET
              precision    recall  f1-score   support

     neutral     0.4989    0.5825    0.5375       388
         joy     0.6828    0.7557    0.7174       131
       trust     0.4000    0.3840    0.3918       125
     disgust     0.2315    0.2212    0.2262       113
    optimism     0.4706    0.4364    0.4528       110
anticipation     0.3063    0.3617    0.3317        94
     sadness     0.3509    0.3226    0.3361        62
        fear     0.2917    0.2692    0.2800        52
    surprise     0.3750    0.0857    0.1395        35
       anger     0.3333    0.1143    0.1702        35
   pessimism     0.1667    0.1034    0.1277        29
        love     0.6667    0.3529    0.4615        17

    accuracy                         0.4450      1191
   macro avg     0.3979    0.3325    0.3477      1191
weighted avg     0.4346    0.4450    0.4331      1191
```