# EMPATHY TRACK - BASELINE REGRESSOR
## ACL 2023 Conference
## WASSA 2023 Shared Task on Empathy, Emotion, and Personality Detection in Interactions
More details [here](https://codalab.lisn.upsaclay.fr/competitions/11167#learn_the_details)

In [1]:
import numpy as np
import pandas as pd
import sklearn
import re, os
import ftfy
import pycld2 as cld2
import time
from typing import List
from copy import deepcopy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, multilabel_confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 400)

import warnings
warnings.filterwarnings("ignore")

#os.path.join()

In [2]:
# Seed handed to the estimator parameter dicts below via their 'random_state' entry.
random_state = 47

In [3]:
# Hand-picked candidate stop words, kept in two groups.
# NOTE(review): the "8cats" / "7cats" suffixes presumably refer to the number of
# categories each word showed up in -- confirm with the author.
words_8cats =      [ "'s", 'a', 'about', 'after', 'again', 'all', 'am', 'america', 'an', 'and', 'animal', 'animals',
                    'are', 'around', 'as', 'at', 'bad', 'be', 'because', 'but', 'by', 'can', 'children',
                    'crazy', 'death', 'do', 'even', 'find', 'for', 'from', 'get', 'go', 'had', 'has', 'have',
                    'having', 'he', 'his', 'horrible', 'how', 'i', 'if', 'in', 'is', 'it', 'its', 'just',
                    'kill', 'killed', 'know', 'like', 'live', 'life', 'lives', 'lived', 'm', 'make', 'makes',
                    'man', 'me', 'mind', 'more', 'most', 'much', 'my', 'need', 'never', 'no', 'not', 'now',
                    'of', 'on', 'one', 'or', 'other', 'out', 'people', 'place', 'put', 'really', 'sad', 'see',
                    'seems', 'situation', 'so', 'some', 'something', 'species', 'stop', 'story',
                    'such', 't', 'take', 'that', 'the', 'their', 'them', 'then', 'there', 'these', 'they', 'thing',
                    'things', 'think', 'this', 'time', 'to', 'type', 'up', 'us', 'very', 'war', 'was', 'way',
                    'we', 'were', 'what', 'when', 'with', 'worse', 'would', 'you',
                   ]

words_7cats      = [ 'age', 'air', 'also', 'always', 'any', 'article', 'attack', 'away', 'back', 'been', 'before',
                     'being', 'believe', 'both', 'cause', 'child', 'could', 'country', 'day', 'deal', 'did', 'die',
                     'disease', 'done', 'down', 'during', 'dying', 'each', 'either', 'end', 'facing', 'feel',
                     'felt', 'first', 'food', 'future', 'girl', 'glad', 'going', 'good', 'government', 'great',
                     'guess', 'happened', 'happening', 'hard', 'harm', 'hate', 'her', 'high', 'him', 'humans',
                     'imagine', 'instead', 'interesting', 'job', 'jobs', 'keep', 'kids', 'leave', 'left', 'let',
                     'life', 'living', 'lost', 'lot', 'make', 'many', 'needs', 'new', 'normal', 'often', 'oil',
                     'only', 'over', 'pain', 'person', 'places', 'poor', 'population', 'probably', 'problem',
                     'protect', 'read', 'reading', 'real', 'same', 'say', 'she', 'should', 'show', 'sick',
                     'society', 'someone', 'sounds', 'start', 'still', 'suffering', 'sure', 'terrible',
                     'thinking', 'those', 'though', 'thought', 'twice', 'under', 'water', 'were', 'where',
                     'which', 'who', 'whole', 'why', 'wildlife', 'will', 'woman', 'wonder', 'world', 'worried',
                     'years', 'your', ]

# Plain concatenation of both groups. 'life', 'make' and 'were' occur in both
# lists, so the combined list contains them twice.
experimental_sw = words_7cats + words_8cats

In [4]:
# COMMON STOPWORDS
# Build four English stop-word lists (scikit-learn, NLTK, Lemur, hand-written)
# and merge them into `stopwords_combined`, which the vectorizer configs use.
# Lists are sorted so the printed output is identical from run to run
# (iteration order of a set of strings changes between interpreter sessions).
# The word counts in the trailing comments are the original author's.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS   # public home of the built-in list
from nltk.corpus import stopwords

print('Sklearn:')
stopwords_sklearn = sorted(ENGLISH_STOP_WORDS)                  # 318 words
print(len(stopwords_sklearn))
print(stopwords_sklearn)

print('\nNLTK:')
stopwords_nltk = stopwords.words('english')                    # 180 words
print(len(stopwords_nltk))
print(stopwords_nltk)

print('\nLemur')                                               # 430 words
# One word per line; blank lines are skipped so '' never becomes a stop word.
with open('data/lemur_stopwords.txt', encoding='utf-8') as f:
    stopwords_lemur = [word for word in (line.strip() for line in f) if word]
print(len(stopwords_lemur))
print(stopwords_lemur)

print('\nOther:')                                              # 153 words
stopwords_other = [ "i", "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
print(len(stopwords_other))
print(stopwords_other)

print('\nCOMBINED:')                                           # 579 words
# Union of all four sources, de-duplicated.
stopwords_combined = sorted(set(stopwords_sklearn + stopwords_nltk + stopwords_lemur + stopwords_other))
print(len(stopwords_combined))
print(stopwords_combined)

Sklearn:
318
['a', 'cant', 'inc', 'hasnt', 'else', 'call', 'or', 'former', 'what', 'ours', 'between', 'toward', 'mine', 'each', 'so', 'when', 'nowhere', 'thereupon', 'otherwise', 'onto', 'whereas', 'ltd', 'thereby', 'found', 'side', 'until', 'from', 'find', 'only', 'thence', 'thru', 'thin', 'me', 'now', 'get', 'yours', 'anyhow', 'also', 'con', 'to', 'are', 'and', 'anything', 'sixty', 'why', 'seeming', 'as', 'him', 'take', 'eight', 'elsewhere', 'ten', 'an', 'where', 'down', 'had', 'one', 'put', 'except', 'afterwards', 'fill', 'even', 'thus', 'under', 'over', 'that', 'those', 'full', 'up', 'which', 'nevertheless', 'yet', 'towards', 'with', 'third', 'into', 'be', 'hereafter', 'among', 'well', 'nothing', 'none', 'any', 'empty', 'see', 'twenty', 'become', 'here', 'was', 'cannot', 'hereupon', 'since', 'first', 'because', 'somewhere', 'them', 'most', 'it', 'i', 'last', 'against', 'per', 'he', 'is', 'sometime', 'four', 'nor', 'seemed', 'two', 'namely', 'cry', 'were', 'whatever', 'if', 'another

# Load and prepare data

In [5]:
# Read the pickled train and dev frames and report their shapes.
file1, file2 = 'data/df_train.pkl', 'data/df_dev.pkl'
df_train, df_dev = (pd.read_pickle(pkl_path) for pkl_path in (file1, file2))

print(df_train.shape, df_dev.shape)

(792, 41) (208, 54)


In [6]:
# prepare additional text columns (tsk = title, summary, keywords)
def _add_text_columns(df):
    """Add the combined text columns used as candidate model inputs, in place.

    Every new column is a plain string concatenation of existing columns:

    - ``essay_clean_tsk``: ``title. summary keywords essay``
    - ``essay_clean_ts``: ``title. summary essay``
    - ``title_summary_keywords``: ``title. summary keywords``
    - ``title_summary``: ``title. summary``

    Args:
        df: DataFrame providing the ``gpt4_title``, ``gpt4_summary``,
            ``gpt4_keywords`` and ``essay_clean`` columns.

    Returns:
        The same DataFrame object, with the four columns added.
    """
    # Shared "title. summary" prefix of all four derived columns.
    title_summary = df['gpt4_title'] + '. ' + df['gpt4_summary']

    df['essay_clean_tsk'] = title_summary + ' ' + df['gpt4_keywords'] + ' ' + df['essay_clean']
    df['essay_clean_ts'] = title_summary + ' ' + df['essay_clean']
    df['title_summary_keywords'] = title_summary + ' ' + df['gpt4_keywords']
    df['title_summary'] = title_summary
    return df


# Same derived columns on both splits, so any of them can serve as `text_col`.
for _frame in (df_train, df_dev):
    _add_text_columns(_frame)

In [7]:
# Eyeball the first ten training rows of every candidate text column,
# one value per line, with a ruler between rows.
preview_cols = ['essay_clean', 'essay_clean_tsk', 'essay_clean_ts',
                'title_summary_keywords', 'title_summary']
temp = df_train[preview_cols]
for row_values in temp.values[:10]:
    for text in row_values:
        print(text)
    print('\n', '='*77, '\n')

It breaks my heart to see people living in those conditions. I hope that all the aid that was sent to the island makes it to the people who need it the most. I do not know what I would do it that was my family and I. I would hope that I would do my best, but I can see how depressing and hopeless you could feel having your whole life changed because of a storm and not knowing where your next meal is coming from.
Empathy for Hurricane Victims and Hope for Aid Distribution. The text expresses sadness at seeing people living in poor conditions due to a storm, and hopes that aid reaches those in need. The author empathizes with the affected individuals, acknowledging the potential feelings of depression and hopelessness that come with such a life-changing event. breaks my heart, people living, conditions, aid, island, need, family, depressing, hopeless, whole life changed, storm, next meal It breaks my heart to see people living in those conditions. I hope that all the aid that was sent to 

In [57]:
# Text columns that can feed the model; index 3 selects title + summary + keywords.
candidate_cols = [
    'essay_clean',
    'essay_clean_tsk',
    'essay_clean_ts',
    'title_summary_keywords',
    'title_summary',
]
target_col = 'distress'
text_col = candidate_cols[3]
print('Text column:', text_col)

Text column: title_summary_keywords


In [59]:
# Pull the input text and the regression target out of each split as arrays.
# (original note: for testing on training set)
X_train, y_train = df_train[text_col].values, df_train[target_col].values
X_dev, y_dev = df_dev[text_col].values, df_dev[target_col].values

#X_train, y_train = sklearn.utils.shuffle( X_train, y_train, random_state=random_state, ) 
dataset_shapes = (X_train.shape, y_train.shape, X_dev.shape, y_dev.shape)
print('Shape of datasets: ', *dataset_shapes)

Shape of datasets:  (792,) (792,) (208,) (208,)


# Train

In [11]:
# Random-forest hyper-parameters, kept as an alternative to the XGBoost setup.
# The values are valid for RandomForestRegressor, the only forest this notebook
# imports. The previous dict carried classifier-only settings
# (criterion='entropy', class_weight) and max_features='auto', which newer
# scikit-learn releases no longer accept.
clf_params_rf = {
    'n_estimators': 100,
    'criterion': 'squared_error',                # 'squared_error', 'absolute_error', 'friedman_mse', 'poisson'
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 1.0,                         # all features (what 'auto' meant for a regressor); or 'sqrt', 'log2'
    'random_state': random_state,
    'n_jobs': -1,
}

In [60]:
# XGBoost regressor settings for the word n-gram model.
# The bracketed numbers in some trailing comments are the author's earlier tuning notes.
clf_params_xgb_word = {
    'n_estimators': 145,
    'max_depth': 6,
    'learning_rate': 0.3,    # 0.3 is close too          # eta
    'objective': 'reg:squarederror',                      # squared-error regression
    'eval_metric': 'rmse',                                # was 'merror', a multiclass error rate that does not fit a regression objective
    'base_score': 0.5,
    'booster': 'gbtree',                                  # gbtree, dart
    'tree_method': 'auto',                                # auto, exact, approx, hist and gpu_hist
    'importance_type': 'gain',                            # default "gain", "weight", "cover", "total_gain", "total_cover"
    'gamma': 0,                                           # larger - more conservative, [0, inf]
    'reg_alpha': 0,                                       # L1 reg., larger - more conservative
    'reg_lambda': 1,                                      # L2 reg., larger - more conservative
    'sampling_method': 'uniform',                         # uniform, gradient_based
    'max_delta_step': 1,                                  # 1-10
    'min_child_weight': 1,
    'subsample': 0.7,           # 0.7                     # 0-1
    'colsample_bylevel': 0.28,  # 0.28 (0.5342)           # 0-1
    'colsample_bynode': 1.0,    # 0.28 (0.5342, thres=0.26)
    'colsample_bytree': 1.0,                              # 0-1
    # NOTE(review): 'seed' and 'random_state' are both set, to different values
    # (2 vs. random_state). Presumably only one of them takes effect -- confirm
    # against the installed XGBoost version and drop the other. Left untouched
    # here because removing either could change the fitted model.
    'seed': 2,
    'random_state': random_state,
    'n_jobs': -1,
}

# CountVectorizer / TfidfVectorizer settings: binary presence of word 1- to 3-grams,
# with the merged stop-word list removed.
vect_params = {
    'max_df': 1.0,
    'min_df': 1,
    'analyzer': 'word',
    'ngram_range': (1,3),
    'binary': True,
    'stop_words': stopwords_combined,
}

In [61]:
# Estimator under test: XGBoost regressor built from clf_params_xgb_word.
#clf = RandomForestClassifier( **clf_params_rf )
clf = XGBRegressor(**clf_params_xgb_word)

In [62]:
# Chain the n-gram vectorizer and the regressor, then train on the train split.
#vectorizer = TfidfVectorizer( **vect_params )
vectorizer = CountVectorizer(**vect_params)
pipeline_steps = [
    ('vect', vectorizer),
    ('clf', clf),
]
model = Pipeline(steps=pipeline_steps)
model.fit(X_train, y_train)

In [63]:
# Store the fitted pipeline's predictions on both splits and print the
# Pearson correlation with the gold target (train first, then dev).
pred_col = 'distress_pred_regressor'
df_train[pred_col] = model.predict(X_train)
df_dev[pred_col] = model.predict(X_dev)

train_r = df_train[target_col].corr(df_train[pred_col], method='pearson')
dev_r = df_dev[target_col].corr(df_dev[pred_col], method='pearson')
print(train_r, dev_r)

0.9856166305669771 0.22630210756004965


## APPENDIX

In [67]:
# Hyper-parameter sweep: refit the pipeline once per value in `params1` and
# collect (dev Pearson, train Pearson, value) tuples in `res`.
res = []
#params1 = [i/100 for i in range(2,101,2)] + [i for i in range(100,801,25)]
#params1 = [i/100 for i in range(2,101,2)]
#params1 = [i for i in range(25,401,25)]
params1 = [1,2,3,4,5,6,7,8,9,10,11,15]
#params1 = [7,8,9,10,11,12,14,15]
#params1 = [1]

# Name of the XGBoost hyper-parameter that receives each value of `params1`
# (for example 'max_depth').
# NOTE(review): the loop body as found never referenced `param1`, so every
# iteration fitted exactly the same model. Which setting the recorded runs
# varied cannot be recovered from this notebook. With None the fixed values
# below are used unchanged, as before; set a name to make the sweep effective.
sweep_param = None

for param1 in params1:
    clf_params_xgb_word2 = {
        'n_estimators': 100,
        'max_depth': 6,          # 3 - 0.5489
        'learning_rate': 0.4,    #                            # eta
        'objective': 'reg:squarederror',                      # squared-error regression
        'eval_metric': 'rmse',                                # was 'merror', a multiclass error rate that does not fit a regression objective
        'base_score': 0.5,
        'booster': 'gbtree',                                  # gbtree, dart
        'tree_method': 'auto',                                # auto, exact, approx, hist and gpu_hist
        'importance_type': 'gain',                            # default "gain", "weight", "cover", "total_gain", "total_cover"
        'gamma': 0,                                           # larger - more conservative, [0, inf]
        'reg_alpha': 0.02,                                    # L1 reg., larger - more conservative
        'reg_lambda': 1.0,                                    # L2 reg., larger - more conservative
        'sampling_method': 'uniform',                         # uniform, gradient_based
        'max_delta_step': 1,                                  # 1-10
        'min_child_weight': 1,
        'subsample': 1.0,           # 0.9  (0.5638, thres 0.21)     # 0-1
        'colsample_bylevel': 1.0,   # 0.55 (0.5741, thres 0.25)     # 0-1
        'colsample_bynode': 1.0,
        'colsample_bytree': 1.0,                                    # 0-1
        'seed': 2,
        'random_state': random_state,
        'n_jobs': -1,
    }
    if sweep_param is not None:
        # Plug the current sweep value into the chosen hyper-parameter.
        clf_params_xgb_word2[sweep_param] = param1

    vect_params2 = {
        'max_df': 0.95,
        'min_df': 3,
        'analyzer': 'char_wb',
        'ngram_range': (1,5),
        'binary': True,
        # scikit-learn applies stop_words only when analyzer == 'word', so this
        # entry has no effect with 'char_wb'; kept for switching the analyzer back.
        'stop_words': stopwords_combined,
    }

    clf        = XGBRegressor( **clf_params_xgb_word2 )
    vectorizer = CountVectorizer( **vect_params2 )
    #vectorizer = TfidfVectorizer( **vect_params2 )
    model      = Pipeline( steps=[('vect', vectorizer), ('clf', clf)] )
    model.fit(X_train, y_train)

    df_train['distress_pred_regressor'] = model.predict(X_train)
    df_dev['distress_pred_regressor']   = model.predict(X_dev)
    pearson_train = df_train[target_col].corr(df_train['distress_pred_regressor'], method='pearson')
    pearson_dev   = df_dev[target_col].corr(df_dev['distress_pred_regressor'], method='pearson')
    print('Dev Pearson:', pearson_dev)
    print('Train Pearson:', pearson_train)

    res.append(( pearson_dev, pearson_train, param1 ))
    print('Param:', param1)
    # Running best over all sweep values seen so far.
    print('\nBest Dev Pearson:', round(max(entry[0] for entry in res), 4) )
    print('\n', '='*77, '\n', sep='')   

Dev Pearson: 0.3296804860576358
Train Pearson: 0.9996773582290357
Param: 1

Best Dev Pearson: 0.3297


Dev Pearson: 0.3275640909912672
Train Pearson: 0.9997305188634159
Param: 2

Best Dev Pearson: 0.3297


Dev Pearson: 0.3577123433156057
Train Pearson: 0.9998074209934859
Param: 3

Best Dev Pearson: 0.3577


Dev Pearson: 0.28988069321925447
Train Pearson: 0.9998880231808684
Param: 4

Best Dev Pearson: 0.3577


Dev Pearson: 0.26799356952017594
Train Pearson: 0.9998806123704213
Param: 5

Best Dev Pearson: 0.3577


Dev Pearson: 0.2654923819492749
Train Pearson: 0.9999224046462216
Param: 6

Best Dev Pearson: 0.3577


Dev Pearson: 0.2135686052765268
Train Pearson: 0.9999660898271479
Param: 7

Best Dev Pearson: 0.3577


Dev Pearson: 0.31618665998300477
Train Pearson: 0.999945652726585
Param: 8

Best Dev Pearson: 0.3577


Dev Pearson: 0.2884393498746915
Train Pearson: 0.9999707348013293
Param: 9

Best Dev Pearson: 0.3577


Dev Pearson: 0.27268139825228843
Train Pearson: 0.9999682489256385
Para

In [68]:
# Sweep leaderboard: one (dev Pearson, train Pearson, value) tuple per line, best dev score first.
ranked = sorted(res, key=lambda rec: rec[0], reverse=True)
for rec in ranked:
    print(rec)

(0.3577123433156057, 0.9998074209934859, 3)
(0.3296804860576358, 0.9996773582290357, 1)
(0.3275640909912672, 0.9997305188634159, 2)
(0.31618665998300477, 0.999945652726585, 8)
(0.28988069321925447, 0.9998880231808684, 4)
(0.2884393498746915, 0.9999707348013293, 9)
(0.27268139825228843, 0.9999682489256385, 10)
(0.26799356952017594, 0.9998806123704213, 5)
(0.26676618558385573, 0.9999811434520202, 11)
(0.2654923819492749, 0.9999224046462216, 6)
(0.251629830747169, 0.9999913274280439, 15)
(0.2135686052765268, 0.9999660898271479, 7)


# Best results

'essay_clean'  
Best Dev Pearson: 0.4421

In [None]:
for param1 in params1:
    clf_params_xgb_word2 = {
        'n_estimators': 25,
        'max_depth': 6,          # 3 - 0.5489
        'learning_rate': 0.4,    #                            # eta
        'objective': 'reg:squarederror',  
        'eval_metric': 'merror',                              # multiclass - merror, mlogloss
        'base_score': 0.5,
        'booster': 'gbtree',                                  # gbtree, dart
        'tree_method': 'auto',                                # auto, exact, approx, hist and gpu_hist
        'importance_type': 'gain',                            # default“gain”,“weight”,“cover”,“total_gain”,“total_cover”
        'gamma': 0,                                           # larger - more conservative, [0, inf]
        'reg_alpha': 0.02,                                       # L1 reg., larger - more conservative
        'reg_lambda': 1.0,                                      # L2 rreg., larger - more conservative
        'sampling_method': 'uniform',                         # uniform, gradient_based
        'max_delta_step': 1,                                  # 1-10
        'min_child_weight': 1,
        'subsample': 1.0,           # 0.9  (0.5638, thres 0.21)     # 0-1    
        'colsample_bylevel': 1.0,   #0.55 (0.5741, thres 0.25)     # 0-1
        'colsample_bynode': 1.0,                                    # optimized for higher recall
        'colsample_bytree': 1.0,                                    # 0-1  
        'seed': 2,
        'random_state': random_state,
        'n_jobs': -1,    
}

    vect_params2 = {
        'max_df': 0.95,
        'min_df': 3,
        'analyzer': 'char_wb',
        'ngram_range': (1,7),
        'binary': True,
        'stop_words': stopwords_combined,
    }