# EMO BASELINE
## ACL 2023 Conference
## WASSA 2023 Shared Task on Empathy, Emotion, and Personality Detection in Interactions
More details [here](https://codalab.lisn.upsaclay.fr/competitions/11167#learn_the_details)

In [1]:
import numpy as np
import pandas as pd
import sklearn
import re, os
import ftfy
import pycld2 as cld2
import time
from typing import List
from copy import deepcopy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, multilabel_confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
# raise pandas' display limits so wide / long dataframes are shown without truncation
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 400)

import warnings
# NOTE: this silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries used below
warnings.filterwarnings("ignore")

#os.path.join()

In [2]:
class RidgeClassifierCVProba(RidgeClassifierCV):
    '''
        RidgeClassifierCV() with a predict_proba() method.

        Ridge classifiers only expose decision_function(); here those scores are
        passed through a softmax so the model can be used wherever
        probability-like outputs are expected.
        The values are normalised scores, not calibrated probabilities.
    '''
    def predict_proba(self, X):
        '''
            Return softmax-normalised decision scores with shape
            (n_samples, n_classes); every row sums to 1.
        '''
        d = np.asarray(self.decision_function(X), dtype=float)
        if d.ndim == 1:
            # binary problem: a single score per sample, positive values favour
            # the second class, so the first class gets the negated score
            d = np.column_stack([-d, d])
        # subtracting the row maximum leaves the softmax unchanged but avoids overflow
        e = np.exp(d - d.max(axis=1, keepdims=True))
        # normalise per sample (row), not over the whole array
        return e / e.sum(axis=1, keepdims=True)
    
class RidgeClassifierProba(RidgeClassifier):
    '''
        RidgeClassifier() with a predict_proba() method.

        Ridge classifiers only expose decision_function(); here those scores are
        passed through a softmax so the model can be used wherever
        probability-like outputs are expected.
        The values are normalised scores, not calibrated probabilities.
    '''
    def predict_proba(self, X):
        '''
            Return softmax-normalised decision scores with shape
            (n_samples, n_classes); every row sums to 1.
        '''
        d = np.asarray(self.decision_function(X), dtype=float)
        if d.ndim == 1:
            # binary problem: a single score per sample, positive values favour
            # the second class, so the first class gets the negated score
            d = np.column_stack([-d, d])
        # subtracting the row maximum leaves the softmax unchanged but avoids overflow
        e = np.exp(d - d.max(axis=1, keepdims=True))
        # normalise per sample (row), not over the whole array
        return e / e.sum(axis=1, keepdims=True)

In [3]:
from sklearn.base import TransformerMixin

class DenseTransformer(TransformerMixin):
    '''
        Pipeline step that converts a sparse feature matrix into a dense numpy
        array, for estimators that cannot work with scipy sparse input.
    '''

    def fit(self, X, y=None, **fit_params):
        # stateless: there is nothing to learn
        return self

    def transform(self, X, y=None, **fit_params):
        # toarray() yields a plain ndarray; todense() yields np.matrix,
        # which current scikit-learn estimators reject as input
        if hasattr(X, 'toarray'):
            return X.toarray()
        # input is already dense (ndarray, list, ...)
        return np.asarray(X)
    
# model = Pipeline( steps=[('vect', vectorizer), ('to_dense', DenseTransformer()), ('clf', clf)] )

In [4]:
# a run of two or more whitespace characters (raw string: '\s' is not a valid str escape)
multi_spaces = re.compile(r'\s{2,}')

def clean_text(s):
    '''
        Normalise a piece of text:
          - replace the characters '�' and '•' with a space,
          - repair the text with ftfy.fix_text(),
          - collapse every run of 2+ whitespace characters into a single space,
          - strip leading / trailing whitespace.
        Non-string input (e.g. NaN, None) is returned unchanged.
    '''
    if not isinstance(s, str):
        return s
    # str.replace() is a no-op when the character is absent, so no membership test is needed
    for char in ('�', '•'):
        s = s.replace(char, ' ')
    s = ftfy.fix_text(s)
    s = multi_spaces.sub(' ', s)
    return s.strip()

In [5]:
def detect_lang( t ):
    '''
        Return the top language detected in string t.

        The text is first repaired with ftfy and then handed to pycld2, which
        reports up to three candidate languages with confidence scores.
        Only the first field of the first candidate is returned
        (the language name, per the pycld2 documentation).
        Naive Bayes classifier under the hood -
        results are less certain for strings that are too short.
        More on usage: https://pypi.org/project/pycld2/
    '''
    repaired = ftfy.fix_text( t )
    is_reliable, bytes_found, candidates = cld2.detect( repaired )
    best_candidate = candidates[0]
    return best_candidate[0]

In [6]:
def get_target(emotions: List[str])->List[int]:
    '''
        Multi-hot encode a list of emotion names.

        Returns a list of 8 integers (one slot per category); slot i is 1 when
        key2label[i] is among the given emotions and 0 otherwise.
        An emotion name that is not a key of label2key raises KeyError.
    '''
    encoded = [0] * 8
    for emotion in emotions:
        encoded[ label2key[emotion] ] = 1
    return encoded

In [7]:
# target variables: emotion name -> position in the multi-hot target vector
label2key = {
    name: idx
    for idx, name in enumerate(
        ['Anger', 'Disgust', 'Fear', 'Hope', 'Joy', 'Neutral', 'Sadness', 'Surprise']
    )
}
# reverse lookup: position -> emotion name
key2label = dict( zip(label2key.values(), label2key.keys()) )
print(key2label)

{0: 'Anger', 1: 'Disgust', 2: 'Fear', 3: 'Hope', 4: 'Joy', 5: 'Neutral', 6: 'Sadness', 7: 'Surprise'}


In [8]:
# new new version (Dec 2022)
def upsample_all( df_, labels_col='target', random_state=47 ):
    '''
        Upsample each class in column labels_col of pandas dataframe df_
        to the number of data points in the majority class.

        Parameters:
            df_          - input dataframe (it is not modified)
            labels_col   - name of the column that holds the class label
            random_state - seed used for every sampling / shuffling step

        Returns a new, shuffled dataframe with a fresh 0..n-1 index in which
        every class has as many rows as the largest class. All original rows
        are kept; the rows added to a smaller class are sampled from that
        class - without replacement when the class has enough rows to cover
        the deficit, with replacement otherwise.
        An empty input yields an empty dataframe.
    '''
    if len(df_) == 0:
        return df_.reset_index(drop=True)

    # one sub-dataframe per class, in order of first appearance
    labels  = df_[labels_col].unique()
    dframes = { label: df_[ df_[labels_col] == label ].copy() for label in labels }
    max_len = max( len(frame) for frame in dframes.values() )

    for label, frame in dframes.items():
        deficit = max_len - len(frame)                       # how many rows to add
        # the majority class needs nothing; an empty selection (e.g. a NaN label,
        # which never equals itself) has nothing to sample from
        if deficit > 0 and len(frame) > 0:
            extra = frame.sample( deficit, replace=len(frame) < deficit, random_state=random_state )
            frame = pd.concat( [frame, extra] )              # df len + (max_len - df len)
        dframes[label] = frame.sample( frac=1, random_state=random_state )      # shuffle

    # combine and reshuffle
    df_merged = pd.concat( list(dframes.values()) )
    df_merged = df_merged.sample( frac=1, random_state=random_state ).reset_index(drop=True)

    return df_merged

In [9]:
# seed passed as random_state to the sampling / shuffling calls and model parameters below
random_state = 47

In [10]:
# Hand-picked candidate stop words for experiments.
# NOTE(review): the names suggest words that occur in essays of all 8 emotion categories -
# confirm how the list was derived
words_8cats =      [ "'s", 'a', 'about', 'after', 'again', 'all', 'am', 'america', 'an', 'and', 'animal', 'animals',
                    'are', 'around', 'as', 'at', 'bad', 'be', 'because', 'but', 'by', 'can', 'children',
                    'crazy', 'death', 'do', 'even', 'find', 'for', 'from', 'get', 'go', 'had', 'has', 'have',
                    'having', 'he', 'his', 'horrible', 'how', 'i', 'if', 'in', 'is', 'it', 'its', 'just',
                    'kill', 'killed', 'know', 'like', 'live', 'life', 'lives', 'lived', 'm', 'make', 'makes',
                    'man', 'me', 'mind', 'more', 'most', 'much', 'my', 'need', 'never', 'no', 'not', 'now',
                    'of', 'on', 'one', 'or', 'other', 'out', 'people', 'place', 'put', 'really', 'sad', 'see',
                    'seems', 'situation', 'so', 'some', 'something', 'species', 'stop', 'story',
                    'such', 't', 'take', 'that', 'the', 'their', 'them', 'then', 'there', 'these', 'they', 'thing',
                    'things', 'think', 'this', 'time', 'to', 'type', 'up', 'us', 'very', 'war', 'was', 'way',
                    'we', 'were', 'what', 'when', 'with', 'worse', 'would', 'you',
                   ]

# NOTE(review): presumably words that occur in 7 of the 8 emotion categories - confirm
words_7cats      = [ 'age', 'air', 'also', 'always', 'any', 'article', 'attack', 'away', 'back', 'been', 'before',
                     'being', 'believe', 'both', 'cause', 'child', 'could', 'country', 'day', 'deal', 'did', 'die',
                     'disease', 'done', 'down', 'during', 'dying', 'each', 'either', 'end', 'facing', 'feel',
                     'felt', 'first', 'food', 'future', 'girl', 'glad', 'going', 'good', 'government', 'great',
                     'guess', 'happened', 'happening', 'hard', 'harm', 'hate', 'her', 'high', 'him', 'humans',
                     'imagine', 'instead', 'interesting', 'job', 'jobs', 'keep', 'kids', 'leave', 'left', 'let',
                     'life', 'living', 'lost', 'lot', 'make', 'many', 'needs', 'new', 'normal', 'often', 'oil',
                     'only', 'over', 'pain', 'person', 'places', 'poor', 'population', 'probably', 'problem',
                     'protect', 'read', 'reading', 'real', 'same', 'say', 'she', 'should', 'show', 'sick',
                     'society', 'someone', 'sounds', 'start', 'still', 'suffering', 'sure', 'terrible',
                     'thinking', 'those', 'though', 'thought', 'twice', 'under', 'water', 'were', 'where',
                     'which', 'who', 'whole', 'why', 'wildlife', 'will', 'woman', 'wonder', 'world', 'worried',
                     'years', 'your', ]

# plain concatenation of both lists; 'life', 'make' and 'were' are in both, so they appear twice here
experimental_sw = words_7cats + words_8cats

In [11]:
# COMMON STOPWORDS
from sklearn.feature_extraction import _stop_words    
from nltk.corpus import stopwords                    
 
# English stop words shipped with scikit-learn
print('Sklearn:')
stopwords_sklearn = [*_stop_words.ENGLISH_STOP_WORDS]          # 318 words
print(len(stopwords_sklearn), stopwords_sklearn, sep='\n')

# English stop words shipped with NLTK
print('\nNLTK:')
stopwords_nltk = [*stopwords.words('english')]                 # 180 words
print(len(stopwords_nltk), stopwords_nltk, sep='\n')

# stop words read from a local text file, one entry per line
print('\nLemur')                                               # 430 words
stopwords_lemur = []
with open('data/lemur_stopwords.txt') as f:
    # surrounding whitespace (including the newline) is removed from every line
    stopwords_lemur.extend( line.strip() for line in f )
print(len(stopwords_lemur), stopwords_lemur, sep='\n')

# additional hard-coded stop word list ("i" is listed twice)
print('\nOther:')                                              # 153 words
stopwords_other = [
    "i", "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any",
    "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both",
    "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for",
    "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her",
    "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd",
    "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's",
    "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other",
    "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll",
    "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs",
    "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll",
    "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up",
    "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when",
    "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's",
    "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself",
    "yourselves",
]
print(len(stopwords_other), stopwords_other, sep='\n')

# union of the four lists above, duplicates removed (element order is arbitrary)
print('\nCOMBINED:')                                           # 579 words
stopwords_combined = list({ *stopwords_sklearn, *stopwords_nltk, *stopwords_lemur, *stopwords_other })
print(len(stopwords_combined), stopwords_combined, sep='\n')

Sklearn:
318
['enough', 'often', 'con', 'whence', 'them', 'been', 'anyhow', 'even', 'whither', 'everywhere', 'about', 'themselves', 'somehow', 'others', 'eleven', 'fill', 'under', 'forty', 'same', 'seemed', 'between', 'however', 'whereupon', 'has', 'besides', 'latterly', 'which', 'else', 'am', 'part', 'the', 'made', 'nothing', 'not', 'since', 'take', 'least', 'becoming', 'everyone', 'own', 'empty', 'sometime', 'those', 'many', 'seeming', 'further', 'hereby', 'up', 'both', 'cant', 'ours', 'another', 'three', 'until', 'become', 'who', 'seems', 'two', 'could', 'none', 'her', 'on', 'what', 'de', 'beside', 'and', 'sincere', 'our', 'us', 'wherever', 'un', 'sometimes', 'is', 'when', 'move', 'although', 're', 'a', 'elsewhere', 'toward', 'much', 'per', 'together', 'also', 'few', 'never', 'that', 'therefore', 'inc', 'therein', 'down', 'whoever', 'next', 'perhaps', 'always', 'whether', 'amount', 'describe', 'nobody', 'either', 'anyone', 'fifteen', 'thin', 'wherein', 'should', 'everything', 'get',

# Load and prepare data

In [12]:
# Load the train, dev and augmented dataframes from local pickle files.
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
file1    = 'data/df_train.pkl'
df_train = pd.read_pickle(file1)

file2    = 'data/df_dev.pkl'
df_dev   = pd.read_pickle(file2)

file3    = 'data/df_augmented.pkl'
df_aug   = pd.read_pickle(file3)
# a label stored as a plain string is wrapped into a one-element list; other values are kept as they are
df_aug['emotion']        = df_aug['emotion'].apply( lambda x: [x] if isinstance(x,str) else x)
# multi-hot encode the emotion list of every augmented essay
df_aug['target_encoded'] = df_aug['emotion'].apply( get_target )

print(df_train.shape, df_dev.shape, df_aug.shape)

(792, 41) (208, 54) (165, 3)


In [13]:
# preview the first rows of the augmented data
df_aug.head()

Unnamed: 0,essay,emotion,target_encoded
0,The rising number of drug addiction cases is a...,[Hope],"[0, 0, 0, 1, 0, 0, 0, 0]"
1,I'm flabbergasted by this new medical treatmen...,[Surprise],"[0, 0, 0, 0, 0, 0, 0, 1]"
2,The single mother worked multiple jobs to prov...,[Joy],"[0, 0, 0, 0, 1, 0, 0, 0]"
3,I just came across an article about a recent o...,[Fear],"[0, 0, 1, 0, 0, 0, 0, 0]"
4,"She was diagnosed with a severe illness, but s...",[Joy],"[0, 0, 0, 0, 1, 0, 0, 0]"


In [14]:
# prepare additional text columns (tsk = title, summary, keywords)
def _add_combined_text_columns( frame ):
    '''
        Add, in place, four text columns built from the gpt35_title, gpt35_summary,
        gpt35_keywords and essay_clean_spellchecked columns of frame.
    '''
    title_summary          = frame['gpt35_title'] + '. ' + frame['gpt35_summary']
    title_summary_keywords = title_summary + ' ' + frame['gpt35_keywords']

    # the assignment order fixes the position of the new columns in the dataframe
    frame['essay_clean_spellchecked_tsk'] = title_summary_keywords + ' ' + frame['essay_clean_spellchecked']
    frame['essay_clean_spellchecked_ts']  = title_summary + ' ' + frame['essay_clean_spellchecked']
    frame['title_summary_keywords']       = title_summary_keywords
    frame['title_summary']                = title_summary


_add_combined_text_columns( df_train )
_add_combined_text_columns( df_dev )

In [15]:
# verify prepared text columns: print them for the first 10 training rows
preview_cols = [
    'essay_clean_spellchecked',
    'essay_clean_spellchecked_tsk',
    'essay_clean_spellchecked_ts',
    'title_summary_keywords',
    'title_summary',
]
temp = df_train[preview_cols]
for row in temp.values[:10]:
    # one column value per line, in the order listed above
    print(*row, sep='\n')
    print('\n', '='*77, '\n')

It breaks my heart to see people living in those conditions. I hope that all the aid that was sent to the island makes it to the people who need it the most. I do not know what I would do it that was my family and I. I would hope that I would do my best, but I can see how depressing and hopeless you could feel having your whole life changed because of a storm and not knowing where your next meal is coming from.
The Heartbreaking Reality of Natural Disasters. The author expresses sadness at the living conditions of people affected by a storm and hopes that aid reaches those who need it the most. They acknowledge the difficulty of coping with such a situation and empathize with those who are struggling. heartbreak, people, living conditions, aid, island, needy, family, depression, hopelessness, storm, meal. It breaks my heart to see people living in those conditions. I hope that all the aid that was sent to the island makes it to the people who need it the most. I do not know what I woul

In [16]:
# downsample sadness to neutral
to_drop     = 181                        # number of rows to remove
emotion_col = 'emotion_no_2nd_neut'

def _is_droppable_sadness( labels ):
    '''
        True for a row labelled 'Sadness' that carries none of
        'Joy', 'Surprise', 'Fear' or 'Hope' (other co-labels are allowed).
    '''
    if 'Sadness' not in labels:
        return False
    return not any( rare in labels for rare in ('Joy', 'Surprise', 'Fear', 'Hope') )

mask = df_train[emotion_col].apply( _is_droppable_sadness )
# pick the rows to remove at random, reproducibly
idx_to_drop = df_train[mask].sample(n=to_drop, random_state=random_state).index

print(df_train.shape)
df_train = df_train.drop( idx_to_drop )
print(df_train.shape)

(792, 45)
(611, 45)


In [17]:
# text columns that can serve as model input; the first one is used
candidate_cols = [
    'essay_clean',
    'essay_clean_spellchecked_tsk',
    'essay_clean_spellchecked_ts',
    'title_summary_keywords',
    'title_summary',
]
text_col = candidate_cols[0]
print(f'Text column: {text_col}')

Text column: essay_clean


In [18]:
# add augmented data
# give df_aug the column names used in df_train by copying its own columns:
# the raw essay fills both essay variants, the emotion list fills the label column
for new_col, source_col in [ ('essay_clean',              'essay'),
                             ('essay_clean_spellchecked', 'essay'),
                             ('emotion_no_2nd_neut',      'emotion'), ]:
    df_aug[new_col] = df_aug[source_col].values
# any df_aug column that df_train does not have is reported here
print('Misspelled column names:', [c for c in df_aug.columns if c not in df_train.columns])
df_aug.head()

Misspelled column names: []


Unnamed: 0,essay,emotion,target_encoded,essay_clean,essay_clean_spellchecked,emotion_no_2nd_neut
0,The rising number of drug addiction cases is a...,[Hope],"[0, 0, 0, 1, 0, 0, 0, 0]",The rising number of drug addiction cases is a...,The rising number of drug addiction cases is a...,[Hope]
1,I'm flabbergasted by this new medical treatmen...,[Surprise],"[0, 0, 0, 0, 0, 0, 0, 1]",I'm flabbergasted by this new medical treatmen...,I'm flabbergasted by this new medical treatmen...,[Surprise]
2,The single mother worked multiple jobs to prov...,[Joy],"[0, 0, 0, 0, 1, 0, 0, 0]",The single mother worked multiple jobs to prov...,The single mother worked multiple jobs to prov...,[Joy]
3,I just came across an article about a recent o...,[Fear],"[0, 0, 1, 0, 0, 0, 0, 0]",I just came across an article about a recent o...,I just came across an article about a recent o...,[Fear]
4,"She was diagnosed with a severe illness, but s...",[Joy],"[0, 0, 0, 0, 1, 0, 0, 0]","She was diagnosed with a severe illness, but s...","She was diagnosed with a severe illness, but s...",[Joy]


In [19]:
# Append the augmented rows to the training set and shuffle the result.
print('Before concatenation:', df_train.shape)
# NOTE: the index is not reset, so the rows keep their original labels and
# labels coming from df_aug can coincide with labels already present in df_train;
# columns that df_aug lacks are filled with NaN for the augmented rows
df_train = pd.concat([ df_train.copy(), df_aug.copy() ]).sample(frac=1, random_state=random_state)
print('After concatenation:', df_train.shape)
# per-column count of missing values
print(df_train.isna().sum())
df_train.head(25)

Before concatenation: (611, 45)
After concatenation: (776, 45)
article_id                       165
conversation_id                  165
speaker_number                   165
essay_id                         165
speaker_id                       165
essay                              0
essay_clean                        0
split                            165
gender                           165
education                        165
race                             165
age                              165
income                           165
emotion                            0
target_encoded                     0
emotion_count                    165
article                          165
article_clean                    165
essay_clean_spellchecked           0
article_clean_spellchecked       165
gpt_embedding                    165
emotion_no_2nd_neut                0
gpt35_keywords                   165
gpt35_title                      165
gpt35_summary                    165
gpt4_summary

Unnamed: 0,article_id,conversation_id,speaker_number,essay_id,speaker_id,essay,essay_clean,split,gender,education,race,age,income,emotion,target_encoded,emotion_count,article,article_clean,essay_clean_spellchecked,article_clean_spellchecked,gpt_embedding,emotion_no_2nd_neut,gpt35_keywords,gpt35_title,gpt35_summary,gpt4_summary,gpt4_title,gpt4_keywords,empathy,distress,personality_conscientiousness,personality_openess,personality_extraversion,personality_agreeableness,personality_stability,iri_perspective_taking,iri_personal_distress,iri_fantasy,iri_empathatic_concern,empathy_label,distress_label,essay_clean_spellchecked_tsk,essay_clean_spellchecked_ts,title_summary_keywords,title_summary
233,339.0,300.0,1.0,299.0,19.0,Why is the rest of the world just sitting arou...,Why is the rest of the world just sitting arou...,train,1,6,2,32,35000,[Disgust],"[0, 1, 0, 0, 0, 0, 0, 0]",1.0,The treatment of Calais camp children isn’t ju...,The treatment of Calais camp children isn't ju...,Why is the rest of the world just sitting arou...,The treatment of Calais camp children isn't ju...,"[-0.00291491043753922, -0.019601348787546158, ...",[Disgust],"rest of the world, sitting around, watching, d...",The World's Responsibility to Protect Abused C...,The author is expressing disgust at the fact t...,The text expresses frustration at the inaction...,Global Inaction on Child Abuse: A Call for Col...,"rest of the world, sitting around, watching, d...",6.333333,6.0,5.5,5.0,2.0,5.5,4.5,3.429,2.857,2.857,2.714,132.0,140.0,The World's Responsibility to Protect Abused C...,The World's Responsibility to Protect Abused C...,The World's Responsibility to Protect Abused C...,The World's Responsibility to Protect Abused C...
140,313.0,193.0,1.0,192.0,19.0,"I feel the asylum process is very, very bad in...","I feel the asylum process is very, very bad in...",train,1,6,2,32,35000,[Neutral],"[0, 0, 0, 0, 0, 1, 0, 0]",1.0,Syrians and Iraqis granted asylum in Germany f...,Syrians and Iraqis granted asylum in Germany f...,"I feel the asylum process is very, very bad in...",Syrians and Iraqis granted asylum in Germany f...,"[0.002763778902590275, -0.0021175541914999485,...",[Neutral],"asylum process, bad, countries, Germany, USA, ...",The Need for Improvement in the Asylum Process...,The author believes that the asylum process is...,The asylum process is considered poor in many ...,Inadequacies in the Asylum Process: A Global P...,"asylum process, bad, countries, Germany, USA, ...",5.833333,6.125,5.5,5.0,2.0,5.5,4.5,3.429,2.857,2.857,2.714,129.0,141.0,The Need for Improvement in the Asylum Process...,The Need for Improvement in the Asylum Process...,The Need for Improvement in the Asylum Process...,The Need for Improvement in the Asylum Process...
385,395.0,488.0,1.0,487.0,26.0,This is so sad and tragic. The most selfish th...,This is so sad and tragic. The most selfish th...,train,unknown,unknown,unknown,unknown,unknown,"[Anger, Sadness]","[1, 0, 0, 0, 0, 0, 1, 0]",2.0,"Wife Who Died Alongside Husband, Children in M...","Wife Who Died Alongside Husband, Children in M...",This is so sad and tragic. The most selfish th...,"Wife Who Died Alongside Husband, Children in M...","[-0.01548206340521574, -0.0008526873425580561,...","[Anger, Sadness]","sad, tragic, selfish, innocent lives, own wife...",The Tragic Consequences of Selfishness,The text discusses a tragic event where a man ...,The text expresses sadness and anger over a tr...,"""Tragic Loss of Innocent Lives: Anger and Pity...","sad, tragic, selfish, innocent lives, wife, ki...",7.0,7.0,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,136.0,148.0,The Tragic Consequences of Selfishness. The te...,The Tragic Consequences of Selfishness. The te...,The Tragic Consequences of Selfishness. The te...,The Tragic Consequences of Selfishness. The te...
261,148.0,335.0,1.0,334.0,24.0,This is pretty sad for the elephant. It obvio...,This is pretty sad for the elephant. It obviou...,train,2,7,1,38,42000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]",1.0,Help Demand That Nosey The Elephant Be Rescued...,Help Demand That Nosey The Elephant Be Rescued...,This is pretty sad for the elephant. It obviou...,Help Demand That Nosey The Elephant Be Rescued...,"[-0.016700077801942825, -0.02291671186685562, ...",[Sadness],"elephant, captivity, circuses, shows, inhumane...",The Inhumane Treatment of Elephants in Circuses,The text expresses sadness for elephants being...,The text expresses sadness for elephants being...,The Inhumane Treatment of Elephants in Circuse...,"sad, elephant, no say, matter, not okay, intel...",2.333333,1.0,7.0,3.5,6.5,5.5,6.5,3.429,2.714,2.571,3.857,108.0,100.0,The Inhumane Treatment of Elephants in Circuse...,The Inhumane Treatment of Elephants in Circuse...,The Inhumane Treatment of Elephants in Circuse...,The Inhumane Treatment of Elephants in Circuse...
713,92.0,402.0,2.0,901.0,8.0,This little boy died in a well while he was he...,This little boy died in a well while he was he...,train,2,6,1,62,29000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]",1.0,China: Boy trapped in well found dead after 4 ...,China: Boy trapped in well found dead after 4 ...,This little boy died in a well while he was he...,China: Boy trapped in well found dead after 4 ...,"[0.013278329744935036, -0.004298476502299309, ...",[Sadness],"little boy, died, well, father, harvest vegeta...",Tragic Death of a Little Boy in a Well,A little boy died in a well while helping his ...,A young boy tragically died in a well while he...,Tragic Death of Little Boy in a Well During Ve...,"little boy, died, well, helping, father, harve...",6.666667,5.0,6.5,2.5,1.0,6.5,2.0,3.571,4.0,2.143,4.429,134.0,132.0,Tragic Death of a Little Boy in a Well. A litt...,Tragic Death of a Little Boy in a Well. A litt...,Tragic Death of a Little Boy in a Well. A litt...,Tragic Death of a Little Boy in a Well. A litt...
9,,,,,,This is astonishing. I had no idea that some i...,This is astonishing. I had no idea that some i...,,,,,,,[Surprise],"[0, 0, 0, 0, 0, 0, 0, 1]",,,,This is astonishing. I had no idea that some i...,,,[Surprise],,,,,,,,,,,,,,,,,,,,,,,
276,116.0,353.0,1.0,352.0,53.0,This is just crazy that people are still dying...,This is just crazy that people are still dying...,train,2,3,1,27,25000,"[Anger, Sadness]","[1, 0, 0, 0, 0, 0, 1, 0]",2.0,Everyone agrees we need to fight cholera. No o...,Everyone agrees we need to fight cholera. No o...,This is just crazy that people are still dying...,Everyone agrees we need to fight cholera. No o...,"[0.01276214700192213, 0.007139384746551514, -0...","[Anger, Sadness]","medicine, save lives, epidemics, senseless, di...",The Need for Better Epidemic Preparedness and ...,The author expresses frustration that people a...,The text expresses frustration over people sti...,Inadequate Epidemic Preparedness and the Need ...,"crazy, people, dying, medicine, save lives, en...",6.0,6.0,6.0,6.0,6.0,6.0,6.0,4.0,2.286,3.571,3.714,130.0,140.0,The Need for Better Epidemic Preparedness and ...,The Need for Better Epidemic Preparedness and ...,The Need for Better Epidemic Preparedness and ...,The Need for Better Epidemic Preparedness and ...
306,66.0,388.0,1.0,387.0,33.0,It was a shame that there was a fire but at le...,It was a shame that there was a fire but at le...,train,1,4,5,33,36000,[Neutral],"[0, 0, 0, 0, 0, 1, 0, 0]",1.0,At least 10 hurt after massive Arizona apartme...,At least 10 hurt after massive Arizona apartme...,It was a shame that there was a fire but at le...,At least 10 hurt after massive Arizona apartme...,"[0.000277692946838215, -0.018580766394734383, ...",[Neutral],"fire, firefighters, preventing loss of life, s...",Gratitude for Life-Saving Efforts in a Fire In...,The text discusses a fire that occurred but th...,"A fire occurred, but fortunately, there were n...","""Firefighters Prevent Loss of Life in Tragic B...","shame, fire, did not die, much worse, firefigh...",4.166667,3.875,4.25,4.25,4.5,4.25,3.75,2.714,2.857,3.286,2.9285,119.0,123.0,Gratitude for Life-Saving Efforts in a Fire In...,Gratitude for Life-Saving Efforts in a Fire In...,Gratitude for Life-Saving Efforts in a Fire In...,Gratitude for Life-Saving Efforts in a Fire In...
623,242.0,292.0,2.0,791.0,19.0,The fact that civilians have to be hurt in the...,The fact that civilians have to be hurt in the...,train,1,6,2,32,35000,[Neutral],"[0, 0, 0, 0, 0, 1, 0, 0]",1.0,"Operation killed Afghan civilians, US military...","Operation killed Afghan civilians, US military...",The fact that civilians have to be hurt in the...,"Operation killed Afghan civilians, US military...","[-0.01901341788470745, 0.007733711041510105, 0...",[Neutral],"civilians, hurt, process, get rid, Taliban, in...",Minimizing Civilian Casualties in the Fight Ag...,The author is concerned about the harm caused ...,The text expresses concern about civilian harm...,Minimizing Civilian Casualties in the Fight Ag...,"civilians, hurt, process, Taliban, mind boggli...",6.166667,6.25,5.5,5.0,2.0,5.5,4.5,3.429,2.857,2.857,2.714,131.0,142.0,Minimizing Civilian Casualties in the Fight Ag...,Minimizing Civilian Casualties in the Fight Ag...,Minimizing Civilian Casualties in the Fight Ag...,Minimizing Civilian Casualties in the Fight Ag...
669,113.0,348.0,2.0,847.0,48.0,I don't have a major problem with limiting the...,I don't have a major problem with limiting the...,train,1,6,1,41,28000,[Anger],"[1, 0, 0, 0, 0, 0, 0, 0]",1.0,Environmental health officers call for smoking...,Environmental health officers call for smoking...,I don't have a major problem with limiting the...,Environmental health officers call for smoking...,"[0.013939978554844856, -0.012559131719172001, ...",[Anger],"limiting, areas, smoke, children, second-hand ...",The Absurdity of Banning Smoking in Public View,The author disagrees with limiting smoking are...,The author expresses their understanding of li...,"""Opposing Overreach in Public Smoking Restrict...","major problem, limiting areas, people can smok...",1.0,3.125,2.5,2.0,1.5,2.0,1.0,3.286,4.429,2.714,2.571,100.0,117.0,The Absurdity of Banning Smoking in Public Vie...,The Absurdity of Banning Smoking in Public Vie...,The Absurdity of Banning Smoking in Public Vie...,The Absurdity of Banning Smoking in Public Vie...


In [20]:
# X, y for training
# explode() gives one row per (essay, emotion) pair, so an essay with several emotions is repeated
df_train_exploded = df_train.explode(emotion_col).copy()
# integer class id of the emotion on each row (a label missing from label2key would become NaN)
df_train_exploded['target'] = df_train_exploded[emotion_col].map( label2key )

print('\nTrain set b4 upsampling:\n', df_train_exploded[emotion_col].value_counts(), sep='')
# balance the classes: every class is upsampled to the size of the largest one
df_train_exploded = upsample_all( df_train_exploded.copy(), labels_col='target', random_state=random_state )
print('\nTrain set after upsampling:\n', df_train_exploded[emotion_col].value_counts(), '\n\n', 
       df_train_exploded['target'].value_counts(), sep='')

# single-label training data: one text and one integer class id per row
X_train_exploded = df_train_exploded[text_col].values
y_train_exploded = df_train_exploded['target'].values
#df_train_exploded


Train set b4 upsampling:
Neutral     202
Sadness     202
Anger       118
Disgust      92
Joy          70
Surprise     68
Hope         63
Fear         58
Name: emotion_no_2nd_neut, dtype: int64

Train set after upsampling:
Hope        202
Surprise    202
Joy         202
Fear        202
Sadness     202
Disgust     202
Anger       202
Neutral     202
Name: emotion_no_2nd_neut, dtype: int64

3    202
7    202
4    202
2    202
6    202
1    202
0    202
5    202
Name: target, dtype: int64


In [21]:
# Evaluation inputs on the original training rows (multi-hot targets).
X_train = df_train[text_col].values
y_train_encoded = np.array(df_train['target_encoded'].values.tolist())

# Evaluation inputs on the dev split (multi-hot targets).
X_dev = df_dev[text_col].values
y_dev_encoded = np.array(df_dev['target_encoded'].values.tolist())

# Shuffle texts and labels together so the pairs stay aligned.
X_train_exploded, y_train_exploded = sklearn.utils.shuffle(
    X_train_exploded,
    y_train_exploded,
    random_state=random_state,
)

dataset_shapes = [
    arr.shape
    for arr in (X_train_exploded, y_train_exploded, X_train, y_train_encoded, X_dev, y_dev_encoded)
]
print('Shape of datasets: ', *dataset_shapes)

Shape of datasets:  (1616,) (1616,) (776,) (776, 8) (208,) (208, 8)


# Train

In [22]:
# Keyword arguments for MultinomialNB.
clf_params_nb = {
    'alpha': 1.0,
    'fit_prior': True,
}

# Keyword arguments for RandomForestClassifier.
clf_params_rf = {
    'n_estimators': 100,
    'criterion': 'entropy',                      # 'gini', 'entropy'
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    # 'auto' was an alias of 'sqrt' for classifiers; it was deprecated and later
    # removed from scikit-learn, so the equivalent 'sqrt' is spelled out here.
    'max_features': 'sqrt',                      # 'sqrt', 'log2', int, float or None
    'class_weight': None,                        # dict, 'balanced', 'balanced_subsample', None
    'random_state': random_state,
    'n_jobs': -1,
}


# Keyword arguments for KNeighborsClassifier.
clf_params_knn = {
    'n_neighbors': 5,
    'weights': 'uniform',     # default='uniform', {'uniform', 'distance'}
    'algorithm': 'auto',      # default='auto', {'auto', 'ball_tree', 'kd_tree', 'brute'}
    'metric': 'minkowski',    # default='minkowski', e.g. 'euclidean', 'cosine' + sklearn.neighbors.VALID_METRICS['brute']
    'p': 2,                   # default=2, p for minkowski distance
    'n_jobs': -1,
}

In [23]:
# Keyword arguments for SVC, grouped by what they control.
clf_params_svm = {
    # --- regularization and kernel ---
    'C': 1.0,                            # default=1.0
    'kernel': 'rbf',                     # default='rbf', {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}
    'gamma': 'scale',                    # default='scale', kernel coeff for 'rbf', 'poly' and 'sigmoid'
                                         # 'scale' => 1 / (n_features * X.var()), 'auto' => 1 / n_features
    'degree': 3,                         # default=3, degree for the polynomial kernel
    'coef0': 0.0,                        # default=0.0, independent term for 'poly' and 'sigmoid'

    # --- class handling and outputs ---
    'decision_function_shape': 'ovr',    # default='ovr', {'ovo', 'ovr'}, multiclass => always 'ovo'
    'break_ties': False,                 # default=False, for decision_function_shape='ovr' and num classes>2 (longer)
    'class_weight': 'balanced',          # default=None, dict or 'balanced'
    'probability': True,

    # --- solver ---
    'tol': 1e-3,                         # stopping criteria, default=1e-3
    'max_iter': -1,                      # default=-1, limit on iterations
    'shrinking': True,                   # default=True
    'cache_size': 200,                   # default=200, size of the kernel cache (in MB)

    # --- misc ---
    'verbose': 0,
    'random_state': random_state,
}

In [24]:
# Keyword arguments for LogisticRegression (full parameter list).
clf_params_lr = {

    'C': 1.0,                      # default=1.0, inverse regularization strength, smaller => stronger regularization

    'solver': 'liblinear',         # default='lbfgs' {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'},
    # small dataset => 'liblinear'; big dataset => 'sag' and 'saga' (faster);
    # multiclass => 'newton-cg', 'sag', 'saga', 'lbfgs'; 'liblinear' only for one-versus-rest
    # supported penalties by solver: 'newton-cg', 'lbfgs', 'sag' - ['l2', 'none'], 'liblinear' - ['l1', 'l2'],
    # 'saga' - ['elasticnet', 'l1', 'l2', 'none']

    'max_iter': 200,               # default=100, iters for solvers to converge
    'penalty': 'l2',               # 'l1', 'l2', 'elasticnet' (both), 'none', default='l2' (not for all solvers)
    'dual': True,                  # default=False (dual formulation only for l2 with the liblinear solver)
                                   # Prefer dual=False when n_samples > n_features

    'tol': 1e-4,                   # stopping criteria, default=1e-4
    'fit_intercept': True,         # default True; whether a bias / intercept is added to the decision function
    'intercept_scaling': 1,        # default=1, for solver 'liblinear' and fit_intercept=True (additional term)
    'class_weight': None,          # default=None, dict or 'balanced'

    'multi_class': 'auto',         # default='auto', {'auto', 'ovr', 'multinomial'},
    # 'ovr' => binary problem fit for each label
    # 'multinomial' => multinomial loss fit across entire prob distribution
    # 'auto' selects 'ovr' if the data is binary, or if solver='liblinear', and otherwise selects 'multinomial'.

    'l1_ratio': None,
    # default = None, elastic-net mixing param, [0,1], only for penalty='elasticnet'. l1_ratio=0 => penalty='l2',
    # l1_ratio=1 => penalty='l1', combination of L1 and L2 if in between

    'verbose': 0,
    'warm_start': False,
    # n_jobs is ignored by the 'liblinear' solver (scikit-learn only emits a warning),
    # so it is left at its default; set it again if the solver is changed.
    'n_jobs': None,
    'random_state': random_state,

}

In [25]:
# Keyword arguments for the XGBClassifier used as the main model.
# Trailing numbers in the comments are dev macro-F1 scores noted during tuning.
clf_params_xgb_word = {
        'n_estimators': 100,
        'max_depth': 3,          # 3 - 0.5489
        'learning_rate': 0.1,    #                            # eta
        'objective': 'multi:softmax',                         # multi:softmax, multi:softprob, rank:pairwise
        'eval_metric': 'merror',                              # multiclass - merror, mlogloss
        'base_score': 0.5,
        'booster': 'gbtree',                                  # gbtree, dart
        'tree_method': 'auto',                                # auto, exact, approx, hist and gpu_hist
        'importance_type': 'gain',                            # default "gain", "weight", "cover", "total_gain", "total_cover"
        'gamma': 0,                                           # larger - more conservative, [0, inf]
        'reg_alpha': 0,                                       # L1 reg., larger - more conservative
        'reg_lambda': 1,                                      # L2 reg., larger - more conservative
        'sampling_method': 'uniform',                         # uniform, gradient_based
        'max_delta_step': 1,                                  # 1-10
        'min_child_weight': 1,
        'subsample': 0.9,           # 0.9  (0.5638, thres 0.21)     # 0-1    
        'colsample_bylevel': 1.0,   #0.55 (0.5741, thres 0.25)     # 0-1
        'colsample_bynode': 1.0,                                    # optimized for higher recall
        'colsample_bytree': 1.0,                                    # 0-1  
        # NOTE(review): both 'seed' and 'random_state' are supplied; confirm which one
        # XGBoost honours before changing either, since recorded scores depend on it.
        'seed': 2,
        'num_class': 8,
        #'use_label_encoder': False,
        'random_state': random_state,
        'n_jobs': -1,    
}

# Keyword arguments shared by CountVectorizer / TfidfVectorizer.
vect_params = {
    'max_df': 0.45,    # 0.45 - 0.5285
    'min_df': 1,
    'analyzer': 'char_wb',
    'ngram_range': (1,7),
    'binary': True,
    # NOTE(review): per the scikit-learn docs, stop_words only applies when
    # analyzer == 'word', so it should have no effect with 'char_wb' — confirm.
    'stop_words': stopwords_combined,
}

In [26]:
# Estimator selection: exactly one `clf = ...` line should be active at a time.
# The commented-out lines are alternative estimators kept for quick switching.
#clf = MultinomialNB( **clf_params_nb )
#clf = LogisticRegression( **clf_params_lr )
#clf = RandomForestClassifier( **clf_params_rf )
#clf = SVC( **clf_params_svm )
#clf = KNeighborsClassifier( **clf_params_knn )
clf = XGBClassifier( **clf_params_xgb_word )
#clf = RidgeClassifierCV()
#clf = RidgeClassifierProba()
#clf = AdaBoostClassifier()
#clf = MLPClassifier()
#clf = DecisionTreeClassifier()
#clf = LinearDiscriminantAnalysis()

# Optional probability-calibration wrapper around clf (currently disabled).
#clf_calib   = CalibratedClassifierCV(clf, cv=5, method='sigmoid')

In [27]:
# Chain the text vectorizer and the chosen classifier into one pipeline and fit it
# on the single-label training arrays (X_train_exploded / y_train_exploded).
#vectorizer = TfidfVectorizer( **vect_params )
vectorizer = CountVectorizer( **vect_params )
model       = Pipeline( steps=[('vect', vectorizer), ('clf', clf)] )
model.fit(X_train_exploded, y_train_exploded)

In [28]:
# Per-class probabilities for the original train rows and for the dev rows.
y_pred_train_probas = model.predict_proba(X_train)
y_pred_dev_probas   = model.predict_proba(X_dev)

In [29]:
def convert_preds(pred_probas, threshold = 0.33):
    '''
        Convert predicted probabilities into multi-hot encoded rows.

        The most probable class of every row is always switched on. The second
        most probable class is switched on as well when its probability is
        greater than or equal to `threshold`.

        pred_probas: 2-D array-like of shape (n_samples, n_classes)
        threshold:   minimum probability required for the second class

        Returns an integer array of shape (n_samples, n_classes). The number of
        classes is taken from the input instead of being fixed at 8. An empty
        input yields an empty array.

        Raises ValueError when the input is not 2-dimensional.
    '''
    probas_arr = np.asarray(pred_probas)
    if probas_arr.size == 0:
        return np.array([])
    if probas_arr.ndim != 2:
        raise ValueError(
            f'pred_probas must be 2-dimensional (n_samples, n_classes), got ndim={probas_arr.ndim}'
        )

    n_samples, n_classes = probas_arr.shape
    y_pred_ = np.zeros((n_samples, n_classes), dtype=int)
    for row, probas in enumerate(probas_arr):
        # Row-wise argsort keeps the tie ordering of the per-row implementation.
        sorted_idxs = np.argsort(probas)
        y_pred_[row, sorted_idxs[-1]] = 1
        # A second label only exists when there are at least two classes.
        if n_classes > 1 and probas[sorted_idxs[-2]] >= threshold:
            y_pred_[row, sorted_idxs[-2]] = 1
    return y_pred_


def find_best_threshold(y_encoded, y_pred_probas):
    '''
        Scan thresholds 0.00, 0.01, ..., 1.00 and return the best one.

        For every candidate the probabilities are binarized with convert_preds()
        and scored with the macro-averaged F1 from classification_report().
        Returns the pair [macro_f1, threshold] with the highest macro F1.
    '''
    scored = []
    for step in range(101):
        cutoff = step / 100
        report = classification_report(
            y_encoded,
            convert_preds(y_pred_probas, threshold=cutoff),
            output_dict=True,
        )
        scored.append([report['macro avg']['f1-score'], cutoff])
    # Lists compare element by element, so max() picks the highest F1.
    return max(scored)


# Pick the second-label threshold separately for each split, then binarize the
# predicted probabilities with the split's own threshold.
# NOTE(review): threshold_dev is chosen using the dev labels and then applied to the
# same dev predictions, so scores computed from y_pred_dev_encoded are optimistic.
# Confirm this is intended for the baseline.
_, threshold_train = find_best_threshold(y_train_encoded, y_pred_train_probas)
_, threshold_dev   = find_best_threshold(y_dev_encoded, y_pred_dev_probas)
print(f'Best train and test thresholds: {threshold_train}, {threshold_dev}')

y_pred_train_encoded = convert_preds(y_pred_train_probas, threshold=threshold_train)
y_pred_dev_encoded   = convert_preds(y_pred_dev_probas, threshold=threshold_dev)
# Label names in label2key order; presumably this matches the column order of the
# encoded targets — verify against where label2key is built.
labels = list(label2key.keys())
print('Labels:', labels)

Best train and test thresholds: 0.27, 0.21
Labels: ['Anger', 'Disgust', 'Fear', 'Hope', 'Joy', 'Neutral', 'Sadness', 'Surprise']


In [30]:
# Show the fitted pipeline steps.
for step_title, step_name in (('Vectorizer', 'vect'), ('Classifier', 'clf')):
    print(f'{step_title}:\n', model[step_name], '\n', sep='')

# Per-class report on the training rows: printed as text and kept as a dict.
print('\nTRAINSET')
train_report_text = classification_report(
    y_train_encoded, y_pred_train_encoded, target_names=labels, digits=4
)
print(train_report_text)
clf_rep_train = classification_report(
    y_train_encoded, y_pred_train_encoded, target_names=labels, output_dict=True
)

# Same for the dev rows.
print('DEVSET')
dev_report_text = classification_report(
    y_dev_encoded, y_pred_dev_encoded, target_names=labels, digits=4
)
print(dev_report_text)
clf_rep_dev = classification_report(
    y_dev_encoded, y_pred_dev_encoded, target_names=labels, output_dict=True
)

Vectorizer:
CountVectorizer(analyzer='char_wb', binary=True, max_df=0.45,
                ngram_range=(1, 7),
                stop_words=['enough', 'unable', 'often', 'con', 'wherewith',
                            'whence', 'them', 'thenceforth', 'been', 'anyhow',
                            'even', 'whither', 'everywhere', "how's", 'about',
                            'themselves', 'somehow', 'others', 'thereon',
                            'thou', "shouldn't", 'eleven', 'fill', 'somebody',
                            'under', 'forty', 'same', 'seemed', 'between',
                            "why's", ...])

Classifier:
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1.0, colsample_bynode=1.0, colsample_bytree=1.0,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='merror', feature_types=None, gamma=0, gpu_id=None,
              grow_policy=None, importance_type='gain',
              interactio

Initial results for `text_col + ts` are good (they seem better than for `text_col` alone). Using `ts` alone gives close, slightly lower results.

## APPENDIX

In [45]:
def find_best_threshold2(y_dev_encoded, y_pred_dev_probas):
    '''
        Score every threshold 0.00, 0.01, ..., 1.00 (for control).

        Returns a list of [macro_f1, threshold] pairs, best macro F1 first.
    '''
    def _macro_f1(cutoff):
        # Macro F1 of the predictions binarized at this cutoff.
        binarized = convert_preds(y_pred_dev_probas, threshold=cutoff)
        report = classification_report(y_dev_encoded, binarized, output_dict=True)
        return report['macro avg']['f1-score']

    scored = [[_macro_f1(step / 100), step / 100] for step in range(101)]
    scored.sort(reverse=True)
    return scored

find_best_threshold2(y_dev_encoded, y_pred_dev_probas)

[[0.5637923782787123, 0.21],
 [0.5563234615633137, 0.2],
 [0.5512856529254565, 0.19],
 [0.5407239392861057, 0.26],
 [0.5398555344058867, 0.24],
 [0.537027565027565, 0.25],
 [0.5367097945620769, 0.22],
 [0.5352385614793703, 0.23],
 [0.533215532674898, 0.27],
 [0.5307408126220245, 0.18],
 [0.5302024711881123, 0.17],
 [0.5288625422846734, 0.3],
 [0.5286753256936744, 0.33],
 [0.5273190054121892, 0.29],
 [0.5267087842070023, 0.28],
 [0.5266782760849698, 0.14],
 [0.5255744970985825, 0.16],
 [0.5253721170625968, 0.32],
 [0.5247048943770255, 0.31],
 [0.5234815395096254, 0.15],
 [0.5216467671105833, 0.13],
 [0.5187339893222246, 0.34],
 [0.5180605706432189, 0.12],
 [0.5174366059461795, 0.11],
 [0.5140496682699256, 0.1],
 [0.5127879246075031, 0.36],
 [0.5127259145292336, 0.38],
 [0.5127259145292336, 0.37],
 [0.5109513308042719, 0.35],
 [0.5078373498668962, 0.09],
 [0.5066775274324594, 0.4],
 [0.5066775274324594, 0.39],
 [0.5065189140669202, 0.08],
 [0.4999494037177553, 0.07],
 [0.4987203884183880

In [95]:
# Manual hyper-parameter sweep. Each value in `params` triggers one full
# fit / threshold search / evaluation and appends a summary tuple to `res`.
# As written, `param` is not referenced inside either parameter dict; it is only
# printed and stored, so it must be substituted by hand for the value being swept.
res = []
# Earlier sweep grids, kept for reference:
#params = [i/100 for i in range(1,101)] #+ [i for i in range(150,501,25)]
#params = [2,3,4,5,6,7,8,9,10,11,15]
#params = [7,8,9,10,11,12,14,15]
params = [1]

for param in params:
    # XGBClassifier settings for this run.
    clf_params_xgb_word2 = {
        'n_estimators': 100,
        'max_depth': 3,          # 3 - 0.5489
        'learning_rate': 0.1,    #                            # eta
        'objective': 'multi:softmax',                         # multi:softmax, multi:softprob, rank:pairwise
        'eval_metric': 'merror',                              # multiclass - merror, mlogloss
        'base_score': 0.5,
        'booster': 'gbtree',                                  # gbtree, dart
        'tree_method': 'auto',                                # auto, exact, approx, hist and gpu_hist
        'importance_type': 'gain',                            # default "gain", "weight", "cover", "total_gain", "total_cover"
        'gamma': 0,                                           # larger - more conservative, [0, inf]
        'reg_alpha': 0,                                       # L1 reg., larger - more conservative
        'reg_lambda': 1,                                      # L2 reg., larger - more conservative
        'sampling_method': 'uniform',                         # uniform, gradient_based
        'max_delta_step': 1,                                  # 1-10
        'min_child_weight': 1,
        'subsample': 0.9,           # 0.9  (0.5638, thres 0.21)     # 0-1    
        'colsample_bylevel': 1.0,   #0.55 (0.5741, thres 0.25)     # 0-1
        'colsample_bynode': 1.0,                                    # optimized for higher recall
        'colsample_bytree': 1.0,                                    # 0-1  
        'seed': 2,
        'num_class': 8,
        #'use_label_encoder': False,
        'random_state': random_state,
        'n_jobs': -1,    
}

    # Vectorizer settings for this run.
    vect_params2 = {
        'max_df': 0.45,    # 0.45 - 0.5285
        'min_df': 1,
        'analyzer': 'char_wb',
        'ngram_range': (1,7),
        'binary': True,
        'stop_words': stopwords_combined,
    }

    # Rebuild and refit the pipeline from scratch for every run.
    # Note that clf, vectorizer and model are rebound at notebook level here.
    clf        = XGBClassifier( **clf_params_xgb_word2 )
    vectorizer = CountVectorizer( **vect_params2 )
    #vectorizer = TfidfVectorizer( **vect_params2 )
    model      = Pipeline( steps=[('vect', vectorizer), ('clf', clf)] )
    model.fit(X_train_exploded, y_train_exploded)

    y_pred_train_probas = model.predict_proba(X_train)
    y_pred_dev_probas   = model.predict_proba(X_dev)
    
    # Threshold for the optional second label, searched separately per split.
    _, threshold_train = find_best_threshold(y_train_encoded, y_pred_train_probas)
    _, threshold_dev   = find_best_threshold(y_dev_encoded, y_pred_dev_probas)
    print(f'Best train and test thresholds: {threshold_train}, {threshold_dev}\n')
    
    y_pred_train_encoded = convert_preds(y_pred_train_probas, threshold=threshold_train)
    y_pred_dev_encoded   = convert_preds(y_pred_dev_probas, threshold=threshold_dev)
    labels = list(label2key.keys())

    print('PARAM:', param)
    print('\nTRAINSET')
    print( classification_report( y_train_encoded, y_pred_train_encoded, target_names=labels, digits=4 ) )
    clf_rep1 = classification_report( y_train_encoded, y_pred_train_encoded, target_names=labels, output_dict=True )

    print('\nDEVSET')
    print( classification_report( y_dev_encoded, y_pred_dev_encoded, target_names=labels, digits=4 ) )
    clf_rep2 = classification_report( y_dev_encoded, y_pred_dev_encoded, target_names=labels, output_dict=True )    

    # Summary tuple: (dev micro F1, dev macro F1, train micro F1, train macro F1,
    #                 swept value, dev threshold).
    res.append(( clf_rep2['micro avg']['f1-score'], clf_rep2['macro avg']['f1-score'],
                 clf_rep1['micro avg']['f1-score'], clf_rep1['macro avg']['f1-score'], param, threshold_dev, ))
    # Running best dev macro F1 (index 1 of the tuple) over all runs so far.
    print('\nBest macro F1 score:', round(sorted(res, key=lambda x: x[1], reverse=True)[0][1], 4) )
    print('\n', '='*77, '\n')   

Best train and test thresholds: 0.27, 0.21

PARAM: 1

TRAINSET
              precision    recall  f1-score   support

       Anger     0.9561    0.9237    0.9397       118
     Disgust     0.9880    0.8913    0.9371        92
        Fear     1.0000    1.0000    1.0000        58
        Hope     0.9683    0.9683    0.9683        63
         Joy     1.0000    1.0000    1.0000        70
     Neutral     0.9621    0.8826    0.9206       230
     Sadness     0.9385    0.8317    0.8819       202
    Surprise     1.0000    0.9853    0.9926        68

   micro avg     0.9680    0.9079    0.9370       901
   macro avg     0.9766    0.9354    0.9550       901
weighted avg     0.9673    0.9079    0.9362       901
 samples avg     0.9800    0.9442    0.9516       901


DEVSET
              precision    recall  f1-score   support

       Anger     0.4694    0.6053    0.5287        38
     Disgust     0.3947    0.6250    0.4839        24
        Fear     0.5000    0.6250    0.5556         8
       

In [96]:
# List every recorded run, highest dev macro F1 (tuple index 1) first.
ranked_runs = sorted(res, key=lambda run: run[1], reverse=True)
for i in ranked_runs:
    print(i)

(0.6162162162162161, 0.5637923782787123, 0.9369988545246278, 0.9550211593522074, 1, 0.21)


In [None]:
# Placeholders for pasted tuning logs; each string currently holds only blank lines.

# subsample alone
subsample2 = '''

'''


# colsample_bylevel when subsample = 0.87
colsample_bylevel2 = '''

'''

# colsample_bylevel when subsample = 0.7
# NOTE(review): this rebinds colsample_bylevel2, so the subsample = 0.87 notes
# assigned just above are overwritten — consider using a distinct name.
colsample_bylevel2 = '''

'''

In [None]:
# Placeholders for tuning notes on the regularization parameters
# (each string currently holds only blank lines).
gamma = '''

'''

alpha = '''

'''

lambda1 = '''

'''

# Best results

### Feature col: essay_clean (sadness downsampled to neutral (202))
Using augmented data  
Macro F1 = 0.5638  
The spellchecked column has slightly better results (Macro F1 = 0.574)

In [None]:
# Recorded best configuration for this feature column.
# Both dicts are plain top-level assignments; vect_params2 was previously
# indented, which made the cell fail with an IndentationError.
clf_params_xgb_word2 = {
    'n_estimators': 100,
    'max_depth': 3,          # 3 - 0.5489
    'learning_rate': 0.1,    #                            # eta
    'objective': 'multi:softmax',                         # multi:softmax, multi:softprob, rank:pairwise
    'eval_metric': 'merror',                              # multiclass - merror, mlogloss
    'base_score': 0.5,
    'booster': 'gbtree',                                  # gbtree, dart
    'tree_method': 'auto',                                # auto, exact, approx, hist and gpu_hist
    'importance_type': 'gain',                            # default "gain", "weight", "cover", "total_gain", "total_cover"
    'gamma': 0,                                           # larger - more conservative, [0, inf]
    'reg_alpha': 0,                                       # L1 reg., larger - more conservative
    'reg_lambda': 1,                                      # L2 reg., larger - more conservative
    'sampling_method': 'uniform',                         # uniform, gradient_based
    'max_delta_step': 1,                                  # 1-10
    'min_child_weight': 1,
    'subsample': 0.9,           # 0.9  (0.5638, thres 0.21)     # 0-1
    'colsample_bylevel': 1.0,   #0.55 (0.5741, thres 0.25)     # 0-1
    'colsample_bynode': 1.0,                                    # optimized for higher recall
    'colsample_bytree': 1.0,                                    # 0-1
    'seed': 2,
    'num_class': 8,
    #'use_label_encoder': False,
    'random_state': random_state,
    'n_jobs': -1,
}

vect_params2 = {
    'max_df': 0.45,    # 0.45 - 0.5285
    'min_df': 1,
    'analyzer': 'char_wb',
    'ngram_range': (1,7),
    'binary': True,
    'stop_words': stopwords_combined,
}
# Vectorizer: CountVectorizer

```
TRAINSET
              precision    recall  f1-score   support

       Anger     0.9561    0.9237    0.9397       118
     Disgust     0.9880    0.8913    0.9371        92
        Fear     1.0000    1.0000    1.0000        58
        Hope     0.9683    0.9683    0.9683        63
         Joy     1.0000    1.0000    1.0000        70
     Neutral     0.9621    0.8826    0.9206       230
     Sadness     0.9385    0.8317    0.8819       202
    Surprise     1.0000    0.9853    0.9926        68

   micro avg     0.9680    0.9079    0.9370       901
   macro avg     0.9766    0.9354    0.9550       901
weighted avg     0.9673    0.9079    0.9362       901
 samples avg     0.9800    0.9442    0.9516       901

DEVSET
              precision    recall  f1-score   support

       Anger     0.4694    0.6053    0.5287        38
     Disgust     0.3947    0.6250    0.4839        24
        Fear     0.5000    0.6250    0.5556         8
        Hope     0.4737    0.5625    0.5143        16
         Joy     1.0000    0.5000    0.6667         2
     Neutral     0.4535    0.7222    0.5571        54
     Sadness     0.8000    0.7525    0.7755       101
    Surprise     0.2727    1.0000    0.4286         3

   micro avg     0.5534    0.6951    0.6162       246
   macro avg     0.5455    0.6741    0.5638       246
weighted avg     0.5975    0.6951    0.6317       246
 samples avg     0.6034    0.7163    0.6298       246
```

### Feature col: essay_clean_spellchecked (upsample to sadness (383))
Before augmented data was used  
Macro F1 = 0.5057

In [None]:
# Recorded XGBClassifier / vectorizer settings for this feature column
# (kept for reference; running the cell rebinds clf_params_xgb_word and vect_params).
clf_params_xgb_word = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.1,                                 # eta
    'objective': 'multi:softmax',                         # multi:softmax, multi:softprob, rank:pairwise
    'eval_metric': 'merror',                              # multiclass - merror, mlogloss
    'base_score': 0.5,
    'booster': 'gbtree',                                  # gbtree, dart
    'tree_method': 'auto',                                # auto, exact, approx, hist and gpu_hist
    'importance_type': 'gain',                            # default "gain", "weight", "cover", "total_gain", "total_cover"
    'gamma': 0,                                           # larger - more conservative, [0, inf]
    'reg_alpha': 0,                                       # L1 reg., larger - more conservative
    'reg_lambda': 1,                                      # L2 reg., larger - more conservative
    'sampling_method': 'uniform',                         # uniform, gradient_based
    'max_delta_step': 1,                                  # 1-10
    'min_child_weight': 1,
    'subsample': 0.7,                                     # 0-1    
    'colsample_bylevel': 0.45,  #0.45                            # 0-1
    'colsample_bynode': 1.0,  #0.45                             # optimized for higher recall
    'colsample_bytree': 1.0,                              # 0-1  
    'seed': 2,
    'num_class': 8,
    #'use_label_encoder': False,
    'random_state': random_state,
    'n_jobs': -1,    
}

vect_params = {
    'max_df': 1.0,
    'min_df': 1,
    'analyzer': 'char_wb',
    'ngram_range': (1,5),
    'binary': True,
    'stop_words': stopwords_combined,
}
# Vectorizer: CountVectorizer

### Feature col: essay_clean_spellchecked (sadness downsampled to neutral (202))
Before augmented data was used  

In [None]:
# Recorded configuration for this feature column.
# Everything is a plain top-level statement; vect_params2 was previously
# indented, which made the cell fail with an IndentationError.
clf_params_xgb_word2 = {
    'n_estimators': 145,
    'max_depth': 6,
    'learning_rate': 0.1,    # 0.3 is close too          # eta
    'objective': 'multi:softmax',                         # multi:softmax, multi:softprob, rank:pairwise
    'eval_metric': 'merror',                              # multiclass - merror, mlogloss
    'base_score': 0.5,
    'booster': 'gbtree',                                  # gbtree, dart
    'tree_method': 'auto',                                # auto, exact, approx, hist and gpu_hist
    'importance_type': 'gain',                            # default "gain", "weight", "cover", "total_gain", "total_cover"
    'gamma': 0,                                           # larger - more conservative, [0, inf]
    'reg_alpha': 0,                                       # L1 reg., larger - more conservative
    'reg_lambda': 1,                                      # L2 reg., larger - more conservative
    'sampling_method': 'uniform',                         # uniform, gradient_based
    'max_delta_step': 1,                                  # 1-10
    'min_child_weight': 1,
    'subsample': 0.7,           # 0.7                                # 0-1
    'colsample_bylevel': 0.28,   #0.28 (0.5342)                  # 0-1
    'colsample_bynode': 1.0,  #0.28 (0.5342, thres=0.26)      # optimized for higher recall
    'colsample_bytree': 1.0,                                    # 0-1
    'seed': 2,
    'num_class': 8,
    #'use_label_encoder': False,
    'random_state': random_state,
    'n_jobs': -1,
}

vect_params2 = {
    'max_df': 1.0,
    'min_df': 1,
    'analyzer': 'char_wb',
    'ngram_range': (1,5),
    'binary': True,
    'stop_words': stopwords_combined,
}

# Assemble (but do not fit) the pipeline for this configuration.
clf        = XGBClassifier( **clf_params_xgb_word2 )
vectorizer = CountVectorizer( **vect_params2 )
model      = Pipeline( steps=[('vect', vectorizer), ('clf', clf)] )

```
Best train and test thresholds: 0.07, 0.23

PARAM: 1

TRAINSET
              precision    recall  f1-score   support

       Anger     0.9915    0.9915    0.9915       118
     Disgust     1.0000    1.0000    1.0000        92
        Fear     1.0000    1.0000    1.0000        33
        Hope     1.0000    1.0000    1.0000        32
         Joy     1.0000    1.0000    1.0000        10
     Neutral     0.9808    0.8870    0.9315       230
     Sadness     0.9851    0.9802    0.9826       202
    Surprise     1.0000    1.0000    1.0000        19

   micro avg     0.9888    0.9579    0.9731       736
   macro avg     0.9947    0.9823    0.9882       736
weighted avg     0.9885    0.9579    0.9725       736
 samples avg     0.9935    0.9749    0.9792       736


DEVSET
              precision    recall  f1-score   support

       Anger     0.4500    0.4737    0.4615        38
     Disgust     0.6154    0.6667    0.6400        24
        Fear     0.6667    0.5000    0.5714         8
        Hope     0.5385    0.4375    0.4828        16
         Joy     0.0000    0.0000    0.0000         2
     Neutral     0.4824    0.7593    0.5899        54
     Sadness     0.7788    0.8020    0.7902       101
    Surprise     1.0000    0.6667    0.8000         3

   micro avg     0.6123    0.6870    0.6475       246
   macro avg     0.5665    0.5382    0.5420       246
weighted avg     0.6241    0.6870    0.6474       246
 samples avg     0.6683    0.7139    0.6659       246


Best macro F1 score: 0.542
```

### Feature column: essay_clean_spellchecked_ts (sadness downsampled to neutral (202))
Before augmented data was used  
F1 macro never got over 0.47

In [None]:
# Recorded XGBClassifier / vectorizer settings for this feature column
# (kept for reference; running the cell rebinds clf_params_xgb_word and vect_params).
clf_params_xgb_word = {
    'n_estimators': 125,
    'max_depth': 7,
    'learning_rate': 0.43,    # 0.3 is close too          # eta
    'objective': 'multi:softmax',                         # multi:softmax, multi:softprob, rank:pairwise
    'eval_metric': 'merror',                              # multiclass - merror, mlogloss
    'base_score': 0.5,
    'booster': 'gbtree',                                  # gbtree, dart
    'tree_method': 'auto',                                # auto, exact, approx, hist and gpu_hist
    'importance_type': 'gain',                            # default "gain", "weight", "cover", "total_gain", "total_cover"
    'gamma': 0.61,                                           # larger - more conservative, [0, inf]
    'reg_alpha': 0,                                       # L1 reg., larger - more conservative
    'reg_lambda': 1,                                      # L2 reg., larger - more conservative
    'sampling_method': 'uniform',                         # uniform, gradient_based
    'max_delta_step': 1,                                  # 1-10
    'min_child_weight': 1,
    'subsample': 1.0,                                     # 0-1    
    'colsample_bylevel': 1.0,  #0.45                            # 0-1
    'colsample_bynode': 1.0,  #0.45                             # optimized for higher recall
    'colsample_bytree': 1.0,                              # 0-1  
    'seed': 2,
    'num_class': 8,
    #'use_label_encoder': False,
    'random_state': random_state,
    'n_jobs': -1,    
}
# Character n-grams (1-4) within word boundaries, binary presence features.
vect_params = {
    'max_df': 1.0,
    'min_df': 1,
    'analyzer': 'char_wb',
    'ngram_range': (1,4),
    'binary': True,
    'stop_words': stopwords_combined,
}

### Feature columns: tsk (sadness downsampled to neutral (202))
Before augmented data was used  
Never got over 0.45 before fine-tuning the regularization params (word or char n-grams, TfidfVectorizer or CountVectorizer)

In [None]:
# Recorded configuration for this feature column.
# Both dicts are plain top-level assignments; vect_params2 was previously
# indented, which made the cell fail with an IndentationError.
clf_params_xgb_word2 = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.32,    # 0.3 is close too          # eta
    'objective': 'multi:softmax',                         # multi:softmax, multi:softprob, rank:pairwise
    'eval_metric': 'merror',                              # multiclass - merror, mlogloss
    'base_score': 0.5,
    'booster': 'gbtree',                                  # gbtree, dart
    'tree_method': 'auto',                                # auto, exact, approx, hist and gpu_hist
    'importance_type': 'gain',                            # default "gain", "weight", "cover", "total_gain", "total_cover"
    'gamma': 0,                                           # larger - more conservative, [0, inf]
    'reg_alpha': 0,                                       # L1 reg., larger - more conservative
    'reg_lambda': 1,                                      # L2 reg., larger - more conservative
    'sampling_method': 'uniform',                         # uniform, gradient_based
    'max_delta_step': 1,                                  # 1-10
    'min_child_weight': 1,
    'subsample': 1.0,                                     # 0-1
    'colsample_bylevel': 1.0,  #0.45                            # 0-1
    'colsample_bynode': 1.0,  #0.45                             # optimized for higher recall
    'colsample_bytree': 1.0,                              # 0-1
    'seed': 2,
    'num_class': 8,
    #'use_label_encoder': False,
    'random_state': random_state,
    'n_jobs': -1,
}

# Word unigram counts (non-binary) with the combined stop-word list.
vect_params2 = {
    'max_df': 1.0,
    'min_df': 1,
    'analyzer': 'word',
    'ngram_range': (1,1),
    'binary': False,
    'stop_words': stopwords_combined,
}