In [1]:
import sys
sys.path.append('../')

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr

from utils.utils import Utils
from utils.preprocess import Preprocess

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/eastwind/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eastwind/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/eastwind/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/eastwind/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /home/eastwind/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package brown to /home/eastwind/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [3]:
%%time
# Shared project helpers used by the cells below: `utils` reads the datasets,
# loads the word-weight lexicon and scores essays; `pre` cleans essay text.
utils = Utils()
# NOTE(review): the meaning of mode="clean" is defined in utils.preprocess,
# which is not visible from this notebook — confirm before changing it.
pre = Preprocess(mode="clean")

CPU times: user 4.17 s, sys: 219 ms, total: 4.39 s
Wall time: 7.95 s


In [4]:
def compute_correlation(y_true, y_pred):
    """Return the Pearson correlation between gold labels and predictions.

    Both inputs are coerced to NumPy arrays and flattened to 1-D before being
    handed to ``scipy.stats.pearsonr``, so column vectors of shape ``(n, 1)``,
    flat arrays, plain Python lists and pandas Series are all accepted.

    Parameters
    ----------
    y_true : array-like
        Gold-standard values.
    y_pred : array-like
        Predicted values; must contain the same number of elements as
        ``y_true``.

    Returns
    -------
    The result of ``scipy.stats.pearsonr(y_true, y_pred)`` unchanged: the
    correlation coefficient first, the p-value second.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    # np.asarray lets non-ndarray inputs through (lists and Series have no
    # .flatten()); ravel() gives the 1-D layout pearsonr expects.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must contain the same number of elements, "
            "got {} and {}".format(y_true.size, y_pred.size)
        )
    return pearsonr(y_true, y_pred)

In [5]:
# Load the training split; the trailing expression displays its shape
# ((1860, 317) in the recorded run).
train_data = utils.read_data("../dataset/train/train-final.csv")
train_data.shape

(1860, 317)

In [7]:
# Run every training essay through the text cleaner, passing
# remove_stopwords=False and lemmatize=True.
train_essay = [
    pre.clean_text(raw_essay, remove_stopwords=False, lemmatize=True)
    for raw_essay in train_data.essay.values.tolist()
]

In [10]:
# Gold empathy labels for the training split as an (n, 1) column vector,
# where n is the number of labels.
train_gold_labels = train_data.gold_empathy.values.tolist()
train_empathy = np.asarray(train_gold_labels).reshape(len(train_gold_labels), 1)
train_empathy.shape

(1860, 1)

In [8]:
# Load the development split. The shape is shown via the cell's trailing
# expression (rather than print) so it is displayed the same way as the
# training split's shape.
dev_data = utils.read_data("../dataset/dev/dev-final.csv")
dev_data.shape

(270, 304)


In [9]:
# Run every dev essay through the text cleaner with the same settings used
# for the training essays (remove_stopwords=False, lemmatize=True).
dev_essay = [
    pre.clean_text(raw_essay, remove_stopwords=False, lemmatize=True)
    for raw_essay in dev_data.essay.values.tolist()
]

In [16]:
# Gold empathy labels for the dev split as an (n, 1) column vector,
# where n is the number of labels.
dev_gold_labels = dev_data.gold_empathy.values.tolist()
dev_empathy = np.asarray(dev_gold_labels).reshape(len(dev_gold_labels), 1)
dev_empathy.shape

(270, 1)

# Normal (Untransformed) Weights

In [6]:
# Load the empathy lexicon via utils.get_dict, keyed by the "words" column
# with values taken from the "weights" column; then show how many entries
# were loaded.
word_weights = utils.get_dict(
    "../resources/word-weights/empathy_word_weights.csv",
    key_column="words",
    value_column="weights",
)
len(word_weights)

754661

## Training Data

In [11]:
# Score the cleaned training essays with the "weights" lexicon; the recorded
# run yields one score per essay (shape (1860, 1)).
train_essay_empathy_scores = utils.get_essay_empathy_distress_scores(
    train_essay, word_weights
)
train_essay_empathy_scores.shape

(1860, 1)

In [12]:
# Preview the first ten essay-level scores for the training split.
train_essay_empathy_scores[:10]

array([[ 4.51741859],
       [ 9.12979662],
       [ 5.47535553],
       [ 1.29772833],
       [ 5.03349346],
       [-4.2938398 ],
       [-7.89261074],
       [-6.1939338 ],
       [-7.85614902],
       [-0.03071311]])

In [13]:
# Pearson correlation between the gold training labels and the lexicon-based
# essay scores; displayed as (coefficient, p-value).
train_cor = compute_correlation(train_empathy, train_essay_empathy_scores)
train_cor

(0.6738534162770651, 1.7081328565563806e-246)

## Dev Data

In [14]:
# Score the cleaned dev essays with the "weights" lexicon; the recorded run
# yields one score per essay (shape (270, 1)).
dev_essay_empathy_scores = utils.get_essay_empathy_distress_scores(
    dev_essay, word_weights
)
dev_essay_empathy_scores.shape

(270, 1)

In [15]:
# Preview the first ten essay-level scores for the dev split.
dev_essay_empathy_scores[:10]

array([[ 3.75594899],
       [ 2.34015965],
       [ 0.70795214],
       [-0.52954169],
       [ 0.66247737],
       [ 2.38108276],
       [ 3.82768005],
       [-0.12451278],
       [-1.19437408],
       [ 2.40971474]])

In [17]:
# `dev_empathy` is already built from `dev_data.gold_empathy` right after the
# dev essays are cleaned, so it is not recomputed here. Instead, check that
# the gold labels line up one-to-one with the essay scores before the two are
# correlated in the next cell.
assert dev_empathy.shape == dev_essay_empathy_scores.shape, (
    "dev labels and dev essay scores must have matching shapes"
)
dev_empathy.shape

(270, 1)

In [18]:
# Pearson correlation between the gold dev labels and the lexicon-based essay
# scores; displayed as (coefficient, p-value).
dev_cor = compute_correlation(dev_empathy, dev_essay_empathy_scores)
dev_cor

(0.3187895974642142, 8.589274526574551e-08)

# Arctan (Inverse-Tangent) Weights

In [19]:
# Load the same lexicon file again, this time taking values from the
# "weights-arctan" column; then show how many entries were loaded.
arctan_word_weights = utils.get_dict(
    "../resources/word-weights/empathy_word_weights.csv",
    key_column="words",
    value_column="weights-arctan",
)
len(arctan_word_weights)

754661

## Training Data

In [20]:
# Re-score the training essays with the arctan lexicon.
# NOTE: this rebinds `train_essay_empathy_scores`, replacing the scores that
# were computed with `word_weights` earlier in the notebook.
train_essay_empathy_scores = utils.get_essay_empathy_distress_scores(
    train_essay, arctan_word_weights
)
train_essay_empathy_scores.shape

(1860, 1)

In [21]:
# Preview the first ten training scores produced with the arctan weights.
train_essay_empathy_scores[:10]

array([[ 3.84971164],
       [ 7.74371554],
       [ 4.44371217],
       [ 1.11992745],
       [ 4.44836726],
       [-3.52856426],
       [-6.56782751],
       [-5.0720323 ],
       [-6.50202727],
       [ 0.34375441]])

In [22]:
# Pearson correlation between the gold training labels and the arctan-weight
# essay scores; displayed as (coefficient, p-value).
# NOTE: rebinds `train_cor`, replacing the value computed with `word_weights`.
train_cor = compute_correlation(train_empathy, train_essay_empathy_scores)
train_cor

(0.6761280367512145, 9.021502274370438e-249)

## Dev Data

In [23]:
# Re-score the dev essays with the arctan lexicon.
# NOTE: this rebinds `dev_essay_empathy_scores`, replacing the scores that
# were computed with `word_weights` earlier in the notebook.
dev_essay_empathy_scores = utils.get_essay_empathy_distress_scores(
    dev_essay, arctan_word_weights
)
dev_essay_empathy_scores.shape

(270, 1)

In [24]:
# Preview the first ten dev scores produced with the arctan weights.
dev_essay_empathy_scores[:10]

array([[ 3.47335787],
       [ 2.28301139],
       [ 0.76380566],
       [-0.35681684],
       [ 0.67366443],
       [ 1.92135565],
       [ 3.57333348],
       [-0.29729518],
       [-0.87295543],
       [ 2.17300778]])

In [25]:
# Pearson correlation between the gold dev labels and the arctan-weight essay
# scores; displayed as (coefficient, p-value).
# NOTE: rebinds `dev_cor`, replacing the value computed with `word_weights`.
dev_cor = compute_correlation(dev_empathy, dev_essay_empathy_scores)
dev_cor

(0.3347960195287625, 1.7072886648754218e-08)