# EMO BASELINE
## The Association for Computational Linguistics
## WASSA 2023 Shared Task on Empathy, Emotion and Personality Detection in Interactions
More details [here](https://codalab.lisn.upsaclay.fr/competitions/11167#learn_the_details)

In [541]:
import numpy as np
import pandas as pd
import sklearn
import re, os
import ftfy
import pycld2 as cld2
import time
from typing import List
from copy import deepcopy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, multilabel_confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 400)
#os.path.join()

In [None]:
class RidgeClassifierCVProba(RidgeClassifierCV):
    '''
        This lets RidgeClassifierCV() output probabilities with predict_proba()

        The scores from decision_function() are pushed through a row-wise softmax,
        so every row of the result sums to 1. These are pseudo-probabilities
        (a monotone transform of the ridge scores), not calibrated probabilities.
    '''
    def predict_proba(self, X):
        '''
            Return an array of shape (n_samples, n_classes) with softmax-normalised scores.
        '''
        d = np.asarray( self.decision_function(X), dtype=float )
        if d.ndim == 1:
            # binary problem: a single score per sample -> build the two-column form
            d = np.column_stack( [-d, d] )
        # subtract the row maximum before exp() to avoid overflow; the softmax is unchanged
        d = d - d.max(axis=1, keepdims=True)
        e = np.exp(d)
        # normalise per sample (per row), not over the whole batch
        return e / e.sum(axis=1, keepdims=True)
    
class RidgeClassifierProba(RidgeClassifier):
    '''
        This lets RidgeClassifier() output probabilities with predict_proba()

        The scores from decision_function() are pushed through a row-wise softmax,
        so every row of the result sums to 1. These are pseudo-probabilities
        (a monotone transform of the ridge scores), not calibrated probabilities.
    '''
    def predict_proba(self, X):
        '''
            Return an array of shape (n_samples, n_classes) with softmax-normalised scores.
        '''
        d = np.asarray( self.decision_function(X), dtype=float )
        if d.ndim == 1:
            # binary problem: a single score per sample -> build the two-column form
            d = np.column_stack( [-d, d] )
        # subtract the row maximum before exp() to avoid overflow; the softmax is unchanged
        d = d - d.max(axis=1, keepdims=True)
        e = np.exp(d)
        # normalise per sample (per row), not over the whole batch
        return e / e.sum(axis=1, keepdims=True)

In [None]:
from sklearn.base import TransformerMixin

class DenseTransformer(TransformerMixin):
    '''
        Pipeline step that turns the sparse output of a vectorizer into a dense
        numpy array, for classifiers that cannot work with sparse input.
    '''

    def fit(self, X, y=None, **fit_params):
        # stateless: nothing to learn
        return self

    def transform(self, X, y=None, **fit_params):
        # toarray() yields a plain ndarray; todense() would yield np.matrix,
        # which recent scikit-learn versions refuse as estimator input
        if hasattr(X, 'toarray'):
            return X.toarray()
        # already dense (ndarray / np.matrix / nested lists): just normalise the type
        return np.asarray(X)
    
# model = Pipeline( steps=[('vect', vectorizer), ('to_dense', DenseTransformer()), ('clf', clf)] )

In [2]:
# two or more consecutive whitespace characters (raw string: '\s' is not a valid str escape)
multi_spaces = re.compile(r'\s{2,}')

def clean_text(s):
    '''
        Normalise a piece of text:
        replace the '�' and '•' characters with a space, repair the text with
        ftfy.fix_text(), collapse every run of 2+ whitespace characters into a
        single space and strip leading / trailing whitespace.
        Anything that is not a str (e.g. None, NaN) is returned unchanged.
    '''
    if not isinstance(s, str):
        return s
    for char in ['�', '•']:
        s = s.replace(char, ' ')
    s = ftfy.fix_text(s)
    s = multi_spaces.sub(' ', s)
    return s.strip()

In [3]:
def detect_lang( t ):
    '''
        Return the top-ranked language that pycld2 detects in string t.
        The text is first repaired with ftfy.fix_text(); only the first field
        of the first entry in the pycld2 details is returned (the language
        name according to the pycld2 docs), without any confidence score.
        Naive Bayes classifier under the hood -
        results are less certain for strings that are too short.
        More on usage: https://pypi.org/project/pycld2/
    '''
    repaired = ftfy.fix_text( t )
    details  = cld2.detect( repaired )[2]      # third element of the result = per-language details
    best     = details[0]
    return best[0]

In [4]:
def get_target(emotions: List[str])->List[int]:
    '''
        Multi-hot encode a list of emotion names.
        The result has 8 slots (one per category); slot i is 1 when the essay
        carries the category whose id in label2key is i, otherwise 0.
    '''
    encoded = [0 for _ in range(8)]
    for name in emotions:
        encoded[ label2key[name] ] = 1
    return encoded

In [5]:
# target variables: emotion name -> integer class id, and the inverse lookup
label2key = dict( zip(
    ['Anger', 'Disgust', 'Fear', 'Hope', 'Joy', 'Neutral', 'Sadness', 'Surprise'],
    range(8),
) )
key2label = dict( zip(label2key.values(), label2key.keys()) )
print(key2label)

{0: 'Anger', 1: 'Disgust', 2: 'Fear', 3: 'Hope', 4: 'Joy', 5: 'Neutral', 6: 'Sadness', 7: 'Surprise'}


In [22]:
# new new version (Dec 2022)
def upsample_all( df_, labels_col='target', random_state=47 ):
    '''
        Upsample each class in column labels_col of pandas dataframe df_
        to the number of data points in majority class.

        df_          - input dataframe (not modified)
        labels_col   - name of the column holding the class labels
        random_state - seed used for every sampling / shuffling step

        Returns a new, shuffled dataframe with a fresh 0..n-1 index in which
        every class has as many rows as the majority class. Rows whose label
        matches nothing under `==` (e.g. NaN) are left out. An input without
        any usable rows gives back an empty dataframe with the same columns.
    '''
    # one sub-dataframe per non-empty class, in order of first appearance
    dframes = dict()
    for label in df_[labels_col].unique():
        temp = df_[ df_[labels_col] == label ]
        if len(temp) > 0:
            dframes[label] = temp.copy()

    if not dframes:
        return df_.iloc[:0].reset_index(drop=True)

    max_len = max( len(frame) for frame in dframes.values() )

    balanced = []
    for frame in dframes.values():
        deficit = max_len - len(frame)                       # how many rows are missing
        if deficit > 0:
            # draw without replacement while the class has enough distinct rows
            replace = len(frame) < deficit
            extra   = frame.sample( deficit, replace=replace, random_state=random_state )
            frame   = pd.concat( [frame, extra] )            # df len + (max_len - df len)
        balanced.append( frame.sample( frac=1, random_state=random_state ) )   # shuffle

    # combine and reshuffle
    df_merged = pd.concat( balanced )
    df_merged = df_merged.sample( frac=1, random_state=random_state ).reset_index(drop=True)

    return df_merged

In [6]:
# single seed shared by the upsampling, the shuffling and the classifier parameter dicts below
random_state = 47

In [1734]:
words_8cats =      [ "'s", 'a', 'about', 'after', 'again', 'all', 'am', 'america', 'an', 'and', 'animal', 'animals',
                    'are', 'around', 'as', 'at', 'bad', 'be', 'because', 'but', 'by', 'can', 'children',
                    'crazy', 'death', 'do', 'even', 'find', 'for', 'from', 'get', 'go', 'had', 'has', 'have',
                    'having', 'he', 'his', 'horrible', 'how', 'i', 'if', 'in', 'is', 'it', 'its', 'just',
                    'kill', 'killed', 'know', 'like', 'live', 'life', 'lives', 'lived', 'm', 'make', 'makes',
                    'man', 'me', 'mind', 'more', 'most', 'much', 'my', 'need', 'never', 'no', 'not', 'now',
                    'of', 'on', 'one', 'or', 'other', 'out', 'people', 'place', 'put', 'really', 'sad', 'see',
                    'seems', 'situation', 'so', 'some', 'something', 'species', 'stop', 'story',
                    'such', 't', 'take', 'that', 'the', 'their', 'them', 'then', 'there', 'these', 'they', 'thing',
                    'things', 'think', 'this', 'time', 'to', 'type', 'up', 'us', 'very', 'war', 'was', 'way',
                    'we', 'were', 'what', 'when', 'with', 'worse', 'would', 'you',
                   ]

words_7cats      = [ 'age', 'air', 'also', 'always', 'any', 'article', 'attack', 'away', 'back', 'been', 'before',
                     'being', 'believe', 'both', 'cause', 'child', 'could', 'country', 'day', 'deal', 'did', 'die',
                     'disease', 'done', 'down', 'during', 'dying', 'each', 'either', 'end', 'facing', 'feel',
                     'felt', 'first', 'food', 'future', 'girl', 'glad', 'going', 'good', 'government', 'great',
                     'guess', 'happened', 'happening', 'hard', 'harm', 'hate', 'her', 'high', 'him', 'humans',
                     'imagine', 'instead', 'interesting', 'job', 'jobs', 'keep', 'kids', 'leave', 'left', 'let',
                     'life', 'living', 'lost', 'lot', 'make', 'many', 'needs', 'new', 'normal', 'often', 'oil',
                     'only', 'over', 'pain', 'person', 'places', 'poor', 'population', 'probably', 'problem',
                     'protect', 'read', 'reading', 'real', 'same', 'say', 'she', 'should', 'show', 'sick',
                     'society', 'someone', 'sounds', 'start', 'still', 'suffering', 'sure', 'terrible',
                     'thinking', 'those', 'though', 'thought', 'twice', 'under', 'water', 'were', 'where',
                     'which', 'who', 'whole', 'why', 'wildlife', 'will', 'woman', 'wonder', 'world', 'worried',
                     'years', 'your', ]

# Combined custom stop-word list (used as vect_params['stop_words'] further down).
# 'life', 'make' and 'were' occur in both source lists, so they appear twice here.
experimental_sw = words_7cats + words_8cats

## TRAIN

In [None]:
# Load the pickled train / dev dataframes (paths are relative to the working directory).
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects - only load files from a trusted source.
file1    = 'data/df_train.pkl'
df_train = pd.read_pickle(file1)

file2    = 'data/df_dev.pkl'
df_dev   = pd.read_pickle(file2)

In [28]:
# for training
# 'emotion' holds a list of labels per essay; explode() yields one row per (essay, label) pair,
# so an essay with two emotions appears twice, once with each label
df_train_exploded = df_train.explode('emotion').copy()
df_train_exploded['target'] = df_train_exploded['emotion'].map( label2key )     # label name -> class id

print('\nTrain set b4 upsampling:\n', df_train_exploded['emotion'].value_counts(), sep='')
# balance the classes by upsampling every class to the size of the largest one
df_train_exploded = upsample_all( df_train_exploded.copy(), labels_col='target', random_state=random_state )
print('\nTrain set after upsampling:\n', df_train_exploded['emotion'].value_counts(), '\n\n', 
       df_train_exploded['target'].value_counts(), sep='')

# single-label training data: essay text -> one class id
X_train_exploded = df_train_exploded['essay_clean'].values
y_train_exploded = df_train_exploded['target'].values
del df_train_exploded                                                           # no longer needed


Train set b4 upsampling:
Sadness     383
Neutral     240
Anger       124
Disgust     100
Fear         33
Hope         32
Surprise     19
Joy          10
Name: emotion, dtype: int64

Train set after upsampling:
Surprise    383
Anger       383
Sadness     383
Disgust     383
Neutral     383
Hope        383
Joy         383
Fear        383
Name: emotion, dtype: int64

7    383
0    383
6    383
1    383
5    383
3    383
4    383
2    383
Name: target, dtype: int64


In [29]:
# for testing on training set
# (original, non-exploded essays with their multi-hot targets taken from column 'target_encoded')
X_train         = df_train['essay_clean'].values
y_train_encoded = np.array( df_train['target_encoded'].values.tolist() )

# for testing on test set
X_dev          = df_dev['essay_clean'].values
y_dev_encoded  = np.array( df_dev['target_encoded'].values.tolist() )

# NOTE(review): upsample_all() already returns shuffled rows, so this extra shuffle looks redundant - confirm
X_train_exploded, y_train_exploded = sklearn.utils.shuffle( X_train_exploded, y_train_exploded,
                                                            random_state=random_state, ) 
print( 'Shape of datasets: ', X_train_exploded.shape, y_train_exploded.shape, X_train.shape, y_train_encoded.shape,
                              X_dev.shape, y_dev_encoded.shape, )

Shape of datasets:  (3064,) (3064,) (792,) (792, 8) (207,) (207, 8)


In [1774]:
# keyword arguments for MultinomialNB (see the classifier selection cell)
clf_params_nb = {
    'alpha': 1.0,
    'fit_prior': True,
}
# keyword arguments for LogisticRegression (see the classifier selection cell)
clf_params_lr = {
    'C': 1.0,
    'solver': 'liblinear',
    'penalty': 'l2',
    'max_iter': 500,
    'random_state': random_state,
}

In [1766]:
# keyword arguments for RandomForestClassifier (see the classifier selection cell)
clf_params_rf = {
    'n_estimators': 100,
    'criterion': 'entropy',                         # “gini”, “entropy”
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    # 'auto' was deprecated and later removed for random forests in scikit-learn;
    # for classifiers it meant the same as 'sqrt'
    'max_features': 'sqrt',                      # “sqrt”, “log2”, None, int or float
    'class_weight': None,                        # dict, 'balanced', 'balanced_subsample', None
    'random_state': random_state,
    'n_jobs': -1,
}

# keyword arguments for SVC (see the classifier selection cell)
clf_params_svm = {
    
    'C': 1.0,                      # default=1.0
    'kernel': 'rbf',               # default='rbf', {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}
    'degree': 3,                   # default=3, degree of the polynomial kernel ('poly' only)
    'tol': 1e-3,                   # stopping criterion, default=1e-3
    'gamma': 'scale',               # default='scale', kernel coeff for 'rbf', 'poly' and 'sigmoid'
                                   # 'scale' => 1 / (n_features * X.var()), 'auto' => 1 / n_features
    'coef0': 0.0,                  # default=0.0, independent term in kernel function in 'poly' and 'sigmoid'
    'shrinking': True,             # default=True
    'cache_size': 200,             # default=200,   size of the kernel cache (in MB)
    'decision_function_shape': 'ovr',    # default='ovr', {'ovo', 'ovr'}; multiclass training is always 'ovo' internally
    'break_ties': False,           # default=False, for decision_function_shape='ovr' and num classes>2 (slower)
    'max_iter': -1,                # default=-1,    -1 = no limit on iterations
    'class_weight': 'balanced',          # default=None,  dict or 'balanced'
    'probability': True,           # needed so that predict_proba() is available
    'verbose': 0,
    'random_state': random_state,

}

# keyword arguments for KNeighborsClassifier (see the classifier selection cell)
clf_params_knn = {    
    'n_neighbors': 15,
    'weights': 'uniform',     # default='uniform', {'uniform', 'distance'}
    'algorithm': 'auto',      # default='auto', {'auto', 'ball_tree', 'kd_tree', 'brute'}
    'metric': 'minkowski',    # default='minkowski'; also e.g. 'euclidean', 'cosine', see sklearn.neighbors.VALID_METRICS['brute']
    'p': 2,                   # default=2, power parameter p of the minkowski distance
    'n_jobs': -1,
}

In [1814]:
# keyword arguments for the XGBClassifier that is actually trained below
clf_params_xgb_word = {
    'n_estimators': 500,
    'max_depth': 6,
    'learning_rate': 0.2,                                 # eta
    'objective': 'multi:softmax',                         # multi:softmax, multi:softprob, rank:pairwise
    'eval_metric': 'merror',                              # multiclass - merror, mlogloss
    'base_score': 0.5,
    'booster': 'gbtree',                                  # gbtree, dart
    'tree_method': 'auto',                                # auto, exact, approx, hist and gpu_hist
    'importance_type': 'gain',                            # default "gain"; also "weight", "cover", "total_gain", "total_cover"
    'gamma': 0,                                           # larger - more conservative, [0, inf]
    'reg_alpha': 0,                                       # L1 reg., larger - more conservative
    'reg_lambda': 1,                                      # L2 reg., larger - more conservative
    'sampling_method': 'uniform',                         # uniform, gradient_based
    'max_delta_step': 1,                                  # 1-10
    'min_child_weight': 1,
    'subsample': 1.0,                                     # 0-1    
    'colsample_bylevel': 1.0,                             # 0-1
    'colsample_bynode': 1.0,                              # optimized for higher recall
    'colsample_bytree': 1.0,                              # 0-1  
    # NOTE(review): 'seed' (2) and 'random_state' (47) are both set - confirm which one XGBoost uses
    'seed': 2,
    'num_class': 8,                                       # one class per entry of label2key
    #'use_label_encoder': False,
    'random_state': random_state,
    'n_jobs': -1,    
}

In [1815]:
# keyword arguments shared by TfidfVectorizer / CountVectorizer in the pipeline cell
vect_params = {
    'max_df': 1.0,                     # no upper document-frequency cut-off
    'min_df': 2,                       # drop n-grams that occur in fewer than 2 essays
    'analyzer': 'word',
    'ngram_range': (1,3),              # unigrams up to trigrams
    'binary': False,
    'stop_words': experimental_sw,     # custom stop-word list defined above
}

In [1816]:
# Classifier selection: exactly one assignment is active (currently XGBoost);
# the commented-out lines are the alternatives that can be switched in.
#clf = MultinomialNB( **clf_params_nb )
#clf = LogisticRegression( **clf_params_lr )
#clf = RandomForestClassifier( **clf_params_rf )
#clf = SVC( **clf_params_svm )
#clf = KNeighborsClassifier( **clf_params_knn )
clf = XGBClassifier( **clf_params_xgb_word )
#clf = RidgeClassifierCVProba()
#clf = RidgeClassifierProba()
#clf = AdaBoostClassifier()
#clf = MLPClassifier()
#clf = DecisionTreeClassifier()
#clf = LinearDiscriminantAnalysis()

#clf_calib   = CalibratedClassifierCV(clf, cv=5, method='sigmoid')

In [1817]:
# Text -> TF-IDF n-gram features -> classifier, trained on the upsampled single-label data
vectorizer = TfidfVectorizer( **vect_params )
#vectorizer = CountVectorizer( **vect_params )
model       = Pipeline( steps=[('vect', vectorizer), ('clf', clf)] )
model.fit(X_train_exploded, y_train_exploded)

Pipeline(steps=[('vect',
                 TfidfVectorizer(min_df=2, ngram_range=(1, 3),
                                 stop_words=['age', 'air', 'also', 'always',
                                             'any', 'article', 'attack', 'away',
                                             'back', 'been', 'before', 'being',
                                             'believe', 'both', 'cause',
                                             'child', 'could', 'country', 'day',
                                             'deal', 'did', 'die', 'disease',
                                             'done', 'down', 'during', 'dying',
                                             'each', 'either', 'end', ...])),
                ('clf',
                 XGBClassifier(base_score=0....
                               feature_types=None, gamma=0, gpu_id=None,
                               grow_policy=None, importance_type='gain',
                               interaction_constraints=None, learn

In [1818]:
# per-class probabilities for the original (non-exploded) train essays and for the dev essays
y_pred_train_probas = model.predict_proba(X_train)
y_pred_dev_probas   = model.predict_proba(X_dev)

In [1819]:
def convert_preds(pred_probas, threshold = 0.33):
    '''
        Convert predicted probabilities into multi-hot encoded rows with one or two labels.

        pred_probas - iterable of per-sample class probabilities, shape (n_samples, n_classes)
        threshold   - minimum probability the runner-up class needs to be added as a second label

        The most probable class (argmax) is always set to 1; the second most probable
        class is set to 1 as well when its probability is >= threshold.
        Returns an integer numpy array of shape (n_samples, n_classes).
    '''
    y_pred_ = []
    for probas in pred_probas:
        probas      = np.asarray(probas)
        sorted_idxs = np.argsort(probas)                       # ascending: best class is last
        res_idxs    = sorted_idxs[-1:]
        # a runner-up only exists when there are at least two classes
        if len(sorted_idxs) > 1 and probas[sorted_idxs[-2]] >= threshold:
            res_idxs = sorted_idxs[-2:]
        res = [0]*len(probas)                                  # width follows the input, not a fixed 8
        for idx in res_idxs:
            res[idx] = 1
        y_pred_.append(res)
    return np.array(y_pred_)

# runner-up thresholds used for the train / dev predictions (see convert_preds)
threshold_train, threshold_dev = 0.16, 0.15
y_pred_train_encoded = convert_preds(y_pred_train_probas, threshold=threshold_train)
y_pred_dev_encoded   = convert_preds(y_pred_dev_probas, threshold=threshold_dev)
# label names ordered by class id, used as target_names in the reports
labels = list(label2key.keys())
print('Labels:', labels)

Labels: ['Anger', 'Disgust', 'Fear', 'Hope', 'Joy', 'Neutral', 'Sadness', 'Surprise']


In [1820]:
# show the fitted pipeline steps, then the multi-label reports for train and dev
print('Vectorizer:\n', model['vect'], '\n', sep='')
print('Classifier:\n', model['clf'], '\n', sep='')

print('\nTRAINSET')
print( classification_report( y_train_encoded, y_pred_train_encoded, target_names=labels, digits=4 ) )
# same report again as a dict, kept for programmatic access to the scores
clf_rep_train = classification_report( y_train_encoded, y_pred_train_encoded, target_names=labels, output_dict=True )

print('DEVSET')
print( classification_report( y_dev_encoded, y_pred_dev_encoded, target_names=labels, digits=4 ) )
# same report again as a dict, kept for programmatic access to the scores
clf_rep_dev = classification_report( y_dev_encoded, y_pred_dev_encoded, target_names=labels, output_dict=True )

Vectorizer:
TfidfVectorizer(min_df=2, ngram_range=(1, 3),
                stop_words=['age', 'air', 'also', 'always', 'any', 'article',
                            'attack', 'away', 'back', 'been', 'before', 'being',
                            'believe', 'both', 'cause', 'child', 'could',
                            'country', 'day', 'deal', 'did', 'die', 'disease',
                            'done', 'down', 'during', 'dying', 'each', 'either',
                            'end', ...])

Classifier:
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1.0, colsample_bynode=1.0, colsample_bytree=1.0,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='merror', feature_types=None, gamma=0, gpu_id=None,
              grow_policy=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.2, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None, max_delta

In [1715]:
# first ten training essays whose label list contains 'Joy'
mask = df_train['emotion'].apply(lambda x: 'Joy' in x)
df_train[mask]['essay_clean'].values[:10]

array(["The sights and sounds of war are horrible and I would never blame a person for having PTSD. I'm happy that the one solider although he has PTSD, he is working though it and wants to show people that now all people with PTSD are dangerous and there is away to heal yourself after being in a such bad of a place.",
       "It is nice for her to share her journey with cancer. I like how both of them connected with each other. I know that cancer is a sad thing and I'm happy they she found a way to change how she thinks about herself and thinks about life. It was interesting to see how she thought about her husband during that time.",
       "I am writing as regards my thoughts for Polar bears, the poster-child for climate change, are among the animals most affected by the seasonal and year-to-year changes in Arctic sea ice, because they rely on this surface for essential activities such as hunting, traveling and breeding. The researchers recommend that the National Climate Assessment

In [1716]:
# is this joy?
# all dev essays whose label list contains 'Joy'
mask = df_dev['emotion'].apply(lambda x: 'Joy' in x)
df_dev[mask]['essay_clean'].values

array(["You would never even realize how relevant world hunger is until you read articles like this. We are so lucky in the US to not have to worry about kids going hungry. Most schools are starting to offer free lunches to kids too here in the US that they are really trying to combat it but we aren't doing much for other countries. There isn't always a whole lot we can do for other countries but there could be some things done to improve world hunger.",
       "It's really uplifting that people would notice animals suffering in a zoo and would plan a really complicated mission to save them. A mission where you need to have diplomatic talks with three different countries!! It really shows that some special people out there won't pass the buck to someone else, expecting someone else to do the heavy lifting to save something. I was really inspired by this article and I hope they end up setting up that animal center in Gaza."],
      dtype=object)

## APPENDIX

In [1805]:
# BEST THRESHOLD - DEV SET
# Sweep the runner-up threshold from 0.01 to 0.99 and record, per threshold:
# [macro F1, micro F1, macro precision, micro precision, macro recall, micro recall, threshold]
# Loop-local names are used so that y_pred_dev_encoded / clf_rep_dev from the main
# evaluation are not overwritten with the results of the last threshold tried.
res_dev = []
for i in range(1,100):
    threshold  = i/100
    sweep_pred = convert_preds(y_pred_dev_probas, threshold=threshold)
    sweep_rep  = classification_report( y_dev_encoded, sweep_pred, target_names=labels, output_dict=True )
    res_dev.append([ sweep_rep['macro avg']['f1-score'],  sweep_rep['micro avg']['f1-score'],
                     sweep_rep['macro avg']['precision'],  sweep_rep['micro avg']['precision'],
                     sweep_rep['macro avg']['recall'],  sweep_rep['micro avg']['recall'], threshold
                   ])

In [1806]:
# ten best thresholds on the dev set, ranked by macro F1 (first column)
sorted( res_dev, key=lambda x: x[0], reverse=True )[:10]

[[0.28186366095240345,
  0.49768160741885625,
  0.2725099280486544,
  0.40049751243781095,
  0.34609161793372323,
  0.6571428571428571,
  0.01],
 [0.2795937319841033,
  0.5073170731707317,
  0.2807787442462365,
  0.42162162162162165,
  0.3290198586744639,
  0.636734693877551,
  0.06],
 [0.2794475029722674,
  0.5064516129032259,
  0.28012453519419467,
  0.4186666666666667,
  0.33133467348927875,
  0.6408163265306123,
  0.05],
 [0.27914180131808425,
  0.5047619047619049,
  0.2777598644790144,
  0.412987012987013,
  0.33596430311890835,
  0.6489795918367347,
  0.03],
 [0.27849088368297914,
  0.504823151125402,
  0.27797311907219024,
  0.41644562334217505,
  0.33133467348927875,
  0.6408163265306123,
  0.04],
 [0.2784144591292985,
  0.5023547880690739,
  0.27663605439557337,
  0.40816326530612246,
  0.33827911793372323,
  0.6530612244897959,
  0.02],
 [0.26986750917605873,
  0.5057471264367815,
  0.26958821057260285,
  0.4230769230769231,
  0.31889254385964916,
  0.6285714285714286,
  0.07

In [1744]:
# BEST THRESHOLD - TRAINING SET
# Sweep the runner-up threshold from 0.01 to 0.99 and record, per threshold:
# [macro F1, micro F1, macro precision, micro precision, macro recall, micro recall, threshold]
# Loop-local names are used so that y_pred_train_encoded / clf_rep_train from the main
# evaluation are not overwritten with the results of the last threshold tried.
res_train = []
for i in range(1,100):
    threshold  = i/100
    sweep_pred = convert_preds(y_pred_train_probas, threshold=threshold)
    sweep_rep  = classification_report( y_train_encoded, sweep_pred,
                                        target_names=labels, output_dict=True, )
    res_train.append([ sweep_rep['macro avg']['f1-score'],  sweep_rep['micro avg']['f1-score'],
                       sweep_rep['macro avg']['precision'], sweep_rep['micro avg']['precision'],
                       sweep_rep['macro avg']['recall'],    sweep_rep['micro avg']['recall'], threshold
                   ])

In [1745]:
# ten best thresholds on the training set, ranked by macro F1 (first column)
sorted( res_train, key=lambda x: x[0], reverse=True )[:10]

[[0.8584503074031804,
  0.8218872138470129,
  0.8501414624283001,
  0.8658823529411764,
  0.88252710544232,
  0.7821466524973433,
  0.3],
 [0.8578071486368464,
  0.8113098672821697,
  0.8707209577306073,
  0.8876262626262627,
  0.8607072949606824,
  0.7470775770456961,
  0.43],
 [0.8578071486368464,
  0.8113098672821697,
  0.8707209577306073,
  0.8876262626262627,
  0.8607072949606824,
  0.7470775770456961,
  0.44],
 [0.8578071486368464,
  0.8113098672821697,
  0.8707209577306073,
  0.8876262626262627,
  0.8607072949606824,
  0.7470775770456961,
  0.45],
 [0.8578071486368464,
  0.8113098672821697,
  0.8707209577306073,
  0.8876262626262627,
  0.8607072949606824,
  0.7470775770456961,
  0.46],
 [0.8578071486368464,
  0.8113098672821697,
  0.8707209577306073,
  0.8876262626262627,
  0.8607072949606824,
  0.7470775770456961,
  0.47],
 [0.8578071486368464,
  0.8113098672821697,
  0.8707209577306073,
  0.8876262626262627,
  0.8607072949606824,
  0.7470775770456961,
  0.48],
 [0.857807148636

In [None]:
# best XGB
# (parameter set kept for reference; running this cell rebinds clf_params_xgb_word)
clf_params_xgb_word = {
    'n_estimators': 100,
    'max_depth': 5,
    'learning_rate': 0.1,                                 # eta
    'objective': 'multi:softmax',                         # multi:softmax, multi:softprob, rank:pairwise
    'eval_metric': 'merror',                              # multiclass - merror, mlogloss
    'base_score': 0.5,
    'booster': 'gbtree',                                  # gbtree, dart
    'tree_method': 'auto',                                # auto, exact, approx, hist and gpu_hist
    'importance_type': 'gain',                            # default "gain"; also "weight", "cover", "total_gain", "total_cover"
    'gamma': 0,                                           # larger - more conservative, [0, inf]
    'reg_alpha': 0,                                       # L1 reg., larger - more conservative
    'reg_lambda': 1,                                      # L2 reg., larger - more conservative
    'sampling_method': 'uniform',                         # uniform, gradient_based
    'max_delta_step': 1,                                  # 1-10
    'min_child_weight': 1,
    'subsample': 0.7,                                     # 0-1    
    'colsample_bylevel': 1.0,                             # 0-1
    'colsample_bynode': 1.0,                              # optimized for higher recall
    'colsample_bytree': 0.9,                              # 0-1  
    # NOTE(review): 'seed' (2) and 'random_state' are both set - confirm which one XGBoost uses
    'seed': 2,
    'num_class': 8,
    #'use_label_encoder': False,
    'random_state': random_state,
    'n_jobs': -1,    
}

# vectorizer settings recorded together with the "best XGB" parameters above:
# binary character n-grams (1-5 chars) built inside word boundaries
vect_params = {
    'max_df': 1.0,
    'min_df': 1,
    'analyzer': 'char_wb',
    'ngram_range': (1,5),
    'binary': True,
    # NOTE(review): per the scikit-learn docs stop_words only applies when analyzer == 'word',
    # so this entry presumably has no effect with 'char_wb' - confirm
    'stop_words': 'english',
}

CountVectorizer()

In [None]:
# full LR
# annotated keyword arguments for LogisticRegression
clf_params_lr2 = {

    'C': 1.0,                      # default=1.0, inverse regularization strength, smaller => stronger regularization
    
    'solver': 'liblinear',         # default='lbfgs' {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'},
    # small dataset => 'liblinear', big dataset => 'sag' and 'saga' (faster);
    # multiclass => 'newton-cg', 'sag', 'saga', 'lbfgs'; 'liblinear' only for one-versus-rest
    # supported penalties by solver: 'newton-cg', 'lbfgs', 'sag' - ['l2', 'none'], 'liblinear' - ['l1', 'l2'],
    # 'saga' - ['elasticnet', 'l1', 'l2', 'none']

    'max_iter': 200,               # default=100, max iterations for the solver to converge
    'penalty': 'l2',               # 'l1', 'l2', 'elasticnet' (both), 'none', default='l2' (not for all solvers)
    'dual': True,                 # default=False (dual formulation only for l2 with the liblinear solver)
                                   # Prefer dual=False when n_samples > n_features

    'tol': 1e-4,                   # stopping criterion, default=1e-4
    'fit_intercept': True,          # default True; whether a bias / intercept is added to the decision function
    'intercept_scaling': 1,        # default=1, for solver 'liblinear' and fit_intercept=True (additional term)
    'class_weight': None,          # default=None, dict or 'balanced'
        
    'multi_class': 'auto',         #  default='auto', {'auto', 'ovr', 'multinomial'},
    # 'ovr' => binary problem fit for each label
    # 'multinomial' => multinomial loss fit across entire prob distribution
    # 'auto' selects 'ovr' if the data is binary, or if solver='liblinear', and otherwise selects 'multinomial'.

    'l1_ratio': None,
    # default = None, elastic-net mixing param, [0,1], only for penalty='elasticnet'. l1_ratio=0 => penalty='l2',
    # l1_ratio=1 => penalty='l1', combination of L1 and L2 if in between
    
    'verbose': 0,
    'warm_start': False,    
    'n_jobs': -1,                  # NOTE(review): presumably without effect for solver='liblinear' - confirm
    'random_state': random_state,

}

In [1823]:
# print every dev essay (i) followed by its gold emotion labels (j) for manual inspection
for i, j in df_dev[['essay_clean', 'emotion']].values:
    print(i)
    print(j, '\n')

How sad is it that this kind of pain and suffering, and those kind of living conditions still exsist today? what a gap we have in society between developed countries and those that aren't. It's crazy to drive around the US and see all the money people spend on pointless things, and then to think about how the people in Haiti are living.
['Sadness'] 

The article is kind of tragic and hits close to home as I am the son of Haitian immigrants. Haiti has a lot of problems that only become exaggerated during natural disasters. I think what the Haitian people really need from the international community is help developing infrastructure so they can address these issues themselves. Foreign aid only acts as a band aid.
['Sadness'] 

I think that these kinds of stories, are sad, yet inspirational and leave you with kind of a good feeling. Even though his story is sad, it's cool and inspiring/motivational to see that he rose up against his circumstances. That he worked hard to make something of 