In [1]:
import sys
sys.path.append('../')

In [2]:
from utils.utils import Utils
from utils.preprocess import Preprocess
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder
from textblob import TextBlob

import numpy as np
import pandas as pd

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/eastwind/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eastwind/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/eastwind/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/eastwind/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /home/eastwind/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package brown to /home/eastwind/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [3]:
%%time
# Shared helper instances reused by later cells: `utils` reads the CSV splits and
# bins ages, `pre` cleans essay text before lexicon scoring.
utils = Utils()
pre = Preprocess(mode="clean")

CPU times: user 2.98 s, sys: 108 ms, total: 3.09 s
Wall time: 3.1 s


In [39]:
%%time
# Load the train and dev splits through the project reader and print each shape
# as a quick sanity check.
# NOTE(review): the recorded output shows different column counts for the two
# splits (224 vs 219) — confirm every column used below exists in both.
train_data = utils.read_data("../dataset/train/train-empathy-distress-prediction-task-normalized-Empath.csv")
print(train_data.shape)
dev_data = utils.read_data("../dataset/dev/dev-empathy-distress-prediction-task-normalized-Empath.csv")
print(dev_data.shape)

(1860, 224)
(270, 219)
CPU times: user 98.7 ms, sys: 7.91 ms, total: 107 ms
Wall time: 106 ms


In [5]:
def categorize_income(income):
    """Map a numeric income to an ordinal bracket.

    Returns:
        0 for 0 < income <= 25000,
        1 for 25000 < income <= 75000,
        2 for income > 75000,
        None for anything else (zero, negative, or NaN).
    """
    if 0 < income <= 25000:
        return 0
    if 25000 < income <= 75000:
        return 1
    if income > 75000:
        return 2
    # No bracket matched: non-positive or NaN input.
    return None

In [6]:
# Add an `age_bin` column to both splits using the project's age-bucketing helper.
train_data['age_bin'] = train_data['age'].apply(utils.categorize_age)
dev_data['age_bin'] = dev_data['age'].apply(utils.categorize_age)

In [7]:
# Add an `income_bin` column to both splits using categorize_income.
train_data['income_bin'] = train_data['income'].apply(categorize_income)
dev_data['income_bin'] = dev_data['income'].apply(categorize_income)

In [8]:
# Count gold emotion labels among training rows whose gold_distress_bin equals 1.
train_data.loc[train_data["gold_distress_bin"] == 1, "gold_emotion"].value_counts()

sadness     353
anger       161
neutral     123
fear         95
disgust      84
surprise     63
joy          26
Name: gold_emotion, dtype: int64

In [9]:
# Gold empathy scores as plain Python lists (targets for the correlations below).
train_empathy = train_data["gold_empathy"].values.tolist()
dev_empathy = dev_data["gold_empathy"].values.tolist()

In [10]:
# Gold distress scores as plain Python lists (targets for the correlations below).
train_distress = train_data["gold_distress"].values.tolist()
dev_distress = dev_data["gold_distress"].values.tolist()

In [12]:
# ----------------------------------- Function to get word emotion and vad scores -----------------------------------
            
def get_word_scores(sentiment='anger'):
    """Load one NRC word-score lexicon into a dict.

    Reads ``../resources/NRC-resources/<sentiment>-scores.txt``, which is
    expected to hold one ``word<TAB>score`` entry per line.

    Args:
        sentiment: lexicon name used to build the file name.

    Returns:
        dict mapping each word to its score. Scores are kept as the strings
        read from the file; callers convert them with ``float()``.

    Raises:
        OSError: if the lexicon file cannot be opened.
    """
    path = "../resources/NRC-resources/"+sentiment+"-scores.txt"
    word_scores = {}
    # The context manager closes the file; no explicit close() is needed.
    with open(path) as f:
        for line in f:
            fields = line.rstrip("\n").split("\t")
            # Skip blank or tab-less lines (e.g. a trailing empty line) instead
            # of failing with IndexError.
            if len(fields) < 2:
                continue
            word_scores[fields[0]] = fields[1]
    return word_scores

In [13]:
# -------------------------------------------- Function to get essay emotion and vad scores --------------------------------------------
    
def get_essay_nrc_scores(essay, nrc_features, normalize=False):
    """Score every essay against every requested NRC lexicon.

    For each essay and each lexicon named in ``nrc_features``, the score is the
    sum of the lexicon values of the essay's whitespace-separated tokens
    (a repeated token contributes each time it occurs).

    Args:
        essay: sequence of essay strings.
        nrc_features: sequence of lexicon names, each loaded through
            ``get_word_scores``.
        normalize: when True, divide each sum by the number of tokens that
            matched the lexicon, giving the mean score per matched token;
            cells with no match stay 0. The default (False) returns raw sums.

    Returns:
        numpy array of shape ``(len(essay), len(nrc_features))``.
    """
    lexicons = [get_word_scores(name) for name in nrc_features]
    essay_scores = np.zeros((len(essay), len(lexicons)))

    for i, text in enumerate(essay):
        # Tokenise once per essay rather than once per lexicon.
        tokens = text.split()
        for j, lexicon in enumerate(lexicons):
            total = 0.0
            matched = 0
            for word in tokens:
                if word in lexicon:
                    matched += 1
                    total += float(lexicon[word])
            # Guard on `matched` so essays without any lexicon word do not
            # divide by zero.
            if normalize and matched:
                total /= matched
            essay_scores[i][j] = total

    return essay_scores

In [14]:
def column(matrix, i):
    """Return element ``i`` of every row of ``matrix`` as a list."""
    picked = []
    for row in matrix:
        picked.append(row[i])
    return picked

# Polarity and Subjectivity Correlation

In [15]:
# TextBlob sentiment per essay, stacked into an (n_essays, 2) array:
# column 0 is polarity, column 1 is subjectivity.
train_pol_sub_score = [tuple(TextBlob(doc).sentiment) for doc in train_data["essay"].values.tolist()]
train_pol_sub_score = np.asarray(train_pol_sub_score).reshape(
    len(train_pol_sub_score), len(train_pol_sub_score[0])
)
print(train_pol_sub_score.shape)

dev_pol_sub_score = [tuple(TextBlob(doc).sentiment) for doc in dev_data["essay"].values.tolist()]
dev_pol_sub_score = np.asarray(dev_pol_sub_score).reshape(
    len(dev_pol_sub_score), len(dev_pol_sub_score[0])
)
print(dev_pol_sub_score.shape)

(1860, 2)
(270, 2)


In [16]:
# Pearson correlation of each sentiment column with the gold empathy score,
# first on the training split, then on the dev split.
for split_header, sentiment_scores, empathy_targets in (
    ("\nEmpathy Correlation with Training Data: \n", train_pol_sub_score, train_empathy),
    ("\nEmpathy Correlation with Dev Data: \n", dev_pol_sub_score, dev_empathy),
):
    print(split_header)
    for col_idx in range(sentiment_scores.shape[1]):
        print("correlation: {}".format(pearsonr(column(sentiment_scores, col_idx), empathy_targets)))


Empathy Correlation with Training Data: 

correlation: (0.002028890025075345, 0.9303196594201143)
correlation: (-0.010672759257810048, 0.6455195130362161)

Empathy Correlation with Dev Data: 

correlation: (-0.07930419859054999, 0.19390887777637755)
correlation: (0.08903099209462476, 0.1445540775339315)


In [17]:
# Pearson correlation of each sentiment column with the gold distress score,
# first on the training split, then on the dev split.
for split_header, sentiment_scores, distress_targets in (
    ("\nDistress Correlation with Training Data: \n", train_pol_sub_score, train_distress),
    ("\nDistress Correlation with Dev Data: \n", dev_pol_sub_score, dev_distress),
):
    print(split_header)
    for col_idx in range(sentiment_scores.shape[1]):
        print("correlation: {}".format(pearsonr(column(sentiment_scores, col_idx), distress_targets)))


Distress Correlation with Training Data: 

correlation: (-0.12737750226577563, 3.5422518051580214e-08)
correlation: (0.009894647225671899, 0.6697747907208982)

Distress Correlation with Dev Data: 

correlation: (-0.1855684857695902, 0.002200909946092545)
correlation: (0.12635713564059836, 0.03799112591091679)


# NRC features

In [18]:
def get_feature_correlation(feature_list, train_empathy, dev_empathy, train_df, dev_df, remove_stopwords=False, lemmatize=False):
    """Print the Pearson correlation of each NRC lexicon score with a target.

    Essays are cleaned with the notebook-level ``pre`` object, scored against
    every lexicon in ``feature_list``, standardised (scaler fitted on the
    training scores and reused for dev), and each score column is correlated
    with the supplied targets. Results are printed; nothing is returned.

    Args:
        feature_list: lexicon names, forwarded to ``get_essay_nrc_scores``.
        train_empathy: target values for the training essays. Despite the
            name, the notebook also passes distress scores here.
        dev_empathy: target values for the dev essays (same remark).
        train_df: training frame with an ``essay`` column.
        dev_df: dev frame with an ``essay`` column.
        remove_stopwords: forwarded to ``pre.clean_text``.
        lemmatize: forwarded to ``pre.clean_text``.
    """
    scaler = StandardScaler()

    def _lexicon_scores(frame):
        # Clean every essay, then compute raw (un-normalised) lexicon sums.
        cleaned = [
            pre.clean_text(raw_text, remove_stopwords=remove_stopwords, lemmatize=lemmatize)
            for raw_text in frame.essay.values.tolist()
        ]
        return get_essay_nrc_scores(cleaned, feature_list, normalize=False)

    def _report(header, scores, targets):
        # One output line per lexicon column.
        print(header)
        for idx in range(scores.shape[1]):
            print("NRC feature {} correlation: {}".format(feature_list[idx],
                                                          pearsonr(column(scores, idx), targets)))

    train_scores = scaler.fit_transform(_lexicon_scores(train_df))
    _report("\nResult on Training Data: ", train_scores, train_empathy)

    dev_scores = scaler.transform(_lexicon_scores(dev_df))
    _report("\nResult on Dev Data: ", dev_scores, dev_empathy)

In [19]:
# Lexicon names scored per essay; each is used to build a "<name>-scores.txt" path.
nrc_features = [
    'anger', 'anticipation', 'disgust', 'fear',
    'joy', 'sadness', 'surprise', 'trust',
    'valence', 'arousal', 'dominance',
]

# Empathy Correlation with NRC features

In [20]:
# NRC lexicon scores vs. gold empathy, on raw (non-lemmatised, stopwords kept) text.
get_feature_correlation(
    feature_list=nrc_features,
    train_empathy=train_empathy,
    dev_empathy=dev_empathy,
    train_df=train_data,
    dev_df=dev_data,
    remove_stopwords=False,
    lemmatize=False,
)


Result on Training Data: 
NRC feature anger correlation: (0.03371132328207759, 0.14613267742304598)
NRC feature anticipation correlation: (0.05052370744909695, 0.02933917623770778)
NRC feature disgust correlation: (0.032294903152194805, 0.16385207180553704)
NRC feature fear correlation: (0.11596459059165252, 5.307402854588614e-07)
NRC feature joy correlation: (0.1228213505507391, 1.0749676819732623e-07)
NRC feature sadness correlation: (0.18661023104982843, 4.896526938008344e-16)
NRC feature surprise correlation: (-0.02480991051561218, 0.28487140287886126)
NRC feature trust correlation: (0.033288508750313604, 0.15126145732326135)
NRC feature valence correlation: (0.111294507933194, 1.4972235610452352e-06)
NRC feature arousal correlation: (0.10663657137946016, 4.045182576425108e-06)
NRC feature dominance correlation: (0.10246084866879016, 9.529240440432246e-06)

Result on Dev Data: 
NRC feature anger correlation: (0.030141210681472047, 0.6219526250779049)
NRC feature anticipation corre

# Distress Correlation with NRC features

In [21]:
# NRC lexicon scores vs. gold distress. The target parameters are named
# *_empathy in the function signature but receive the distress scores here.
get_feature_correlation(
    feature_list=nrc_features,
    train_empathy=train_distress,
    dev_empathy=dev_distress,
    train_df=train_data,
    dev_df=dev_data,
    remove_stopwords=False,
    lemmatize=False,
)


Result on Training Data: 
NRC feature anger correlation: (0.15691333193688098, 1.0107816294838731e-11)
NRC feature anticipation correlation: (0.04869747876136885, 0.03572486677919946)
NRC feature disgust correlation: (0.14419068588038367, 4.181270882655846e-10)
NRC feature fear correlation: (0.21475620944306706, 7.591040906128478e-21)
NRC feature joy correlation: (0.016936513684404398, 0.4653935491898295)
NRC feature sadness correlation: (0.20076505497813893, 2.292849778397807e-18)
NRC feature surprise correlation: (0.0007436515163227452, 0.9744318291254607)
NRC feature trust correlation: (0.0440947180964118, 0.05725606613530883)
NRC feature valence correlation: (0.09074788520012098, 8.884086764387025e-05)
NRC feature arousal correlation: (0.15419420396826447, 2.299907854295611e-11)
NRC feature dominance correlation: (0.12536125228786218, 5.817526328010011e-08)

Result on Dev Data: 
NRC feature anger correlation: (0.24590457627622556, 4.411975783772861e-05)
NRC feature anticipation co

# IRI and Personality Features

In [22]:
# Column names of the IRI questionnaire scores (spelling matches the dataset headers).
iri_features = [
    'iri_perspective_taking',
    'iri_personal_distress',
    'iri_fantasy',
    'iri_empathatic_concern',
]
# Column names of the personality scores (spelling matches the dataset headers).
personality_features = [
    'personality_conscientiousness',
    'personality_openess',
    'personality_extraversion',
    'personality_agreeableness',
    'personality_stability',
]

In [23]:
# IRI columns of each split as a 2-D array (rows = essays, columns = iri_features order).
train_iri = train_data.loc[:, iri_features].values
print(train_iri.shape)

dev_iri = dev_data.loc[:, iri_features].values
print(dev_iri.shape)

(1860, 4)
(270, 4)


In [24]:
# Personality columns of each split as a 2-D array
# (rows = essays, columns = personality_features order).
train_personality = train_data.loc[:, personality_features].values
print(train_personality.shape)

dev_personality = dev_data.loc[:, personality_features].values
print(dev_personality.shape)

(1860, 5)
(270, 5)


# Empathy Correlation with IRI Features

In [25]:
# Pearson correlation of each IRI column with the gold empathy score, on the
# training split and then the dev split. The label reads "IRI feature" because
# these columns come from iri_features, not from the NRC lexicons.
for split_header, iri_values, empathy_targets in (
    ("\nEmpathy correlation on Training Data: \n", train_iri, train_empathy),
    ("\nEmpathy correlation on Dev Data: \n", dev_iri, dev_empathy),
):
    print(split_header)
    for i in range(iri_values.shape[1]):
        print("IRI feature {} correlation: {}".format(iri_features[i],
                                                      pearsonr(column(iri_values, i), empathy_targets)))


Empathy correlation on Training Data: 

NRC feature iri_perspective_taking correlation: (0.22856663040279537, 1.8023870796887119e-23)
NRC feature iri_personal_distress correlation: (0.005806115549177551, 0.8024030801218004)
NRC feature iri_fantasy correlation: (0.161711922593472, 2.2863423583074164e-12)
NRC feature iri_empathatic_concern correlation: (0.29936333439833496, 8.080200148943306e-40)

Empathy correlation on Dev Data: 

NRC feature iri_perspective_taking correlation: (0.20298941835957596, 0.0007937146200273498)
NRC feature iri_personal_distress correlation: (0.04738842311180666, 0.43804758577495945)
NRC feature iri_fantasy correlation: (0.15504622031303658, 0.010732314263238379)
NRC feature iri_empathatic_concern correlation: (0.24955210765314614, 3.363615805989282e-05)


# Empathy Correlation with Big5 Features

In [26]:
# Pearson correlation of each personality column with the gold empathy score,
# on the training split and then the dev split. The label reads "Personality
# feature" because these columns come from personality_features, not from the
# NRC lexicons.
for split_header, personality_values, empathy_targets in (
    ("\nEmpathy correlation on Training Data: \n", train_personality, train_empathy),
    ("\nEmpathy correlation on Dev Data: \n", dev_personality, dev_empathy),
):
    print(split_header)
    for i in range(personality_values.shape[1]):
        print("Personality feature {} correlation: {}".format(personality_features[i],
                                                              pearsonr(column(personality_values, i), empathy_targets)))


Empathy correlation on Training Data: 

NRC feature personality_conscientiousness correlation: (0.09263106209527383, 6.31060186467026e-05)
NRC feature personality_openess correlation: (0.11377269271127469, 8.679270957892684e-07)
NRC feature personality_extraversion correlation: (0.20902561681934412, 8.272935808223537e-20)
NRC feature personality_agreeableness correlation: (0.24325722930340982, 1.8525600705176216e-26)
NRC feature personality_stability correlation: (0.1037886896228663, 7.281898804278378e-06)

Empathy correlation on Dev Data: 

NRC feature personality_conscientiousness correlation: (0.05585836172304273, 0.36055805782871764)
NRC feature personality_openess correlation: (0.20788348445686353, 0.0005867699857419374)
NRC feature personality_extraversion correlation: (-0.07142000579522055, 0.24216201875575047)
NRC feature personality_agreeableness correlation: (0.2388727885280508, 7.357726509917349e-05)
NRC feature personality_stability correlation: (-0.07024350772260109, 0.25

# Distress Correlation with IRI Features

In [27]:
# Pearson correlation of each IRI column with the gold distress score, on the
# training split and then the dev split. The label reads "IRI feature" because
# these columns come from iri_features, not from the NRC lexicons.
for split_header, iri_values, distress_targets in (
    ("\nDistress correlation on Training Data: \n", train_iri, train_distress),
    ("\nDistress correlation on Dev Data: \n", dev_iri, dev_distress),
):
    print(split_header)
    for i in range(iri_values.shape[1]):
        print("IRI feature {} correlation: {}".format(iri_features[i],
                                                      pearsonr(column(iri_values, i), distress_targets)))


Distress correlation on Training Data: 

NRC feature iri_perspective_taking correlation: (0.1265242467033694, 4.373892590075604e-08)
NRC feature iri_personal_distress correlation: (0.18952486622501458, 1.678101271410058e-16)
NRC feature iri_fantasy correlation: (0.1205412837507924, 1.84614008033545e-07)
NRC feature iri_empathatic_concern correlation: (0.198644782586168, 5.256970035825554e-18)

Distress correlation on Dev Data: 

NRC feature iri_perspective_taking correlation: (0.17102091893068, 0.00483370001736317)
NRC feature iri_personal_distress correlation: (0.3755251101638156, 1.808488933353831e-10)
NRC feature iri_fantasy correlation: (0.33916964288043544, 1.0801312513148551e-08)
NRC feature iri_empathatic_concern correlation: (0.23107911176029355, 0.00012742451421095185)


# Distress Correlation with Big5 Features

In [28]:
# Pearson correlation of each personality column with the gold distress score,
# on the training split and then the dev split. The label reads "Personality
# feature" because these columns come from personality_features, not from the
# NRC lexicons.
for split_header, personality_values, distress_targets in (
    ("\nDistress correlation on Training Data: \n", train_personality, train_distress),
    ("\nDistress correlation on Dev Data: \n", dev_personality, dev_distress),
):
    print(split_header)
    for i in range(personality_values.shape[1]):
        print("Personality feature {} correlation: {}".format(personality_features[i],
                                                              pearsonr(column(personality_values, i), distress_targets)))


Distress correlation on Training Data: 

NRC feature personality_conscientiousness correlation: (-0.05153061053217745, 0.026257626702303652)
NRC feature personality_openess correlation: (0.014207526470026804, 0.5403014682871335)
NRC feature personality_extraversion correlation: (0.12405604614928988, 7.98785639534921e-08)
NRC feature personality_agreeableness correlation: (0.1265897305069749, 4.303881683442054e-08)
NRC feature personality_stability correlation: (-0.0837787455455481, 0.0002979021594045739)

Distress correlation on Dev Data: 

NRC feature personality_conscientiousness correlation: (-0.09324355779674652, 0.12642042352162314)
NRC feature personality_openess correlation: (0.10025728826674211, 0.10019756302891678)
NRC feature personality_extraversion correlation: (-0.25778449051654345, 1.7957123002921912e-05)
NRC feature personality_agreeableness correlation: (0.088025115396524, 0.1491638812272802)
NRC feature personality_stability correlation: (-0.35599698930373724, 1.73465

# High Empathy Analysis

In [51]:
# Independent copies of the rows whose gold_empathy_bin equals 0, plus their
# continuous gold empathy scores as arrays.
# NOTE(review): the section heading says "High Empathy" but the filter selects
# bin == 0 — confirm that 0 encodes the high-empathy class.
he_train_data = train_data.loc[train_data["gold_empathy_bin"] == 0].copy(deep=True)
print(he_train_data.shape)
he_dev_data = dev_data.loc[dev_data["gold_empathy_bin"] == 0].copy(deep=True)
print(he_dev_data.shape)

he_train_empathy = he_train_data["gold_empathy"].values
he_dev_empathy = he_dev_data["gold_empathy"].values

(944, 224)
(150, 219)
